Update prebuilt Clang to match Android kernel.
Bug: 132428451
Change-Id: I8f6e2cb23f381fc0c02ddea99b867e58e925e5be
diff --git a/linux-x64/clang/include/llvm/CodeGen/BasicTTIImpl.h b/linux-x64/clang/include/llvm/CodeGen/BasicTTIImpl.h
index b460cdc..1e9aeab 100644
--- a/linux-x64/clang/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/linux-x64/clang/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1,9 +1,8 @@
//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -80,6 +79,23 @@
using BaseT = TargetTransformInfoImplCRTPBase<T>;
using TTI = TargetTransformInfo;
+ /// Estimate the cost of a broadcast shuffle as an extract followed by a
+ /// sequence of insert operations.
+ unsigned getBroadcastShuffleOverhead(Type *Ty) {
+ assert(Ty->isVectorTy() && "Can only shuffle vectors");
+ unsigned Cost = 0;
+ // Broadcast cost is equal to the cost of extracting the zeroth element
+ // plus the cost of inserting it into every element of the result vector.
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::ExtractElement, Ty, 0);
+
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::InsertElement, Ty, i);
+ }
+ return Cost;
+ }
+
/// Estimate a cost of shuffle as a sequence of extract and insert
/// operations.
unsigned getPermuteShuffleOverhead(Type *Ty) {
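For intuition, the broadcast model above reduces to one extract plus one insert per result element. A minimal standalone sketch of that formula, assuming caller-supplied per-operation costs (hypothetical helper, not part of the patch or of the LLVM API):

  // Cost of modeling a broadcast as extract(0) + NumElts inserts.
  unsigned broadcastShuffleCost(unsigned NumElts, unsigned ExtractCost,
                                unsigned InsertCost) {
    return ExtractCost + NumElts * InsertCost;
  }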
@@ -101,6 +117,50 @@
return Cost;
}
+ /// Estimate the cost of subvector extraction as a sequence of extract and
+ /// insert operations.
+ unsigned getExtractSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
+ assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
+ "Can only extract subvectors from vectors");
+ int NumSubElts = SubTy->getVectorNumElements();
+ assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
+ "SK_ExtractSubvector index out of range");
+
+ unsigned Cost = 0;
+ // Subvector extraction cost is equal to the cost of extracting the elements
+ // from the source vector plus the cost of inserting them into the result
+ // subvector.
+ for (int i = 0; i != NumSubElts; ++i) {
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::ExtractElement, Ty, i + Index);
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::InsertElement, SubTy, i);
+ }
+ return Cost;
+ }
+
+ /// Estimate the cost of subvector insertion as a sequence of extract and
+ /// insert operations.
+ unsigned getInsertSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
+ assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
+ "Can only insert subvectors into vectors");
+ int NumSubElts = SubTy->getVectorNumElements();
+ assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
+ "SK_InsertSubvector index out of range");
+
+ unsigned Cost = 0;
+ // Subvector insertion cost is equal to the cost of extracting the elements
+ // from the subvector plus the cost of inserting them into the result
+ // vector.
+ for (int i = 0; i != NumSubElts; ++i) {
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::ExtractElement, SubTy, i);
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::InsertElement, Ty, i + Index);
+ }
+ return Cost;
+ }
+
/// Local query method delegates up to T which *must* implement this!
const TargetSubtargetInfo *getST() const {
return static_cast<const T *>(this)->getST();
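Both subvector helpers above perform one extract from one vector and one insert into the other per subvector element, so they share the same shape: extracting a <2 x float> at index 2 from an <8 x float>, for example, is 2 extracts plus 2 inserts. A hypothetical sketch of that shared formula under the same unit-cost assumption (not LLVM API):

  // Shared shape of the two subvector cost loops above.
  unsigned subvectorShuffleCost(unsigned NumSubElts, unsigned ExtractCost,
                                unsigned InsertCost) {
    return NumSubElts * (ExtractCost + InsertCost);
  }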
@@ -554,14 +614,20 @@
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
switch (Kind) {
+ case TTI::SK_Broadcast:
+ return getBroadcastShuffleOverhead(Tp);
case TTI::SK_Select:
+ case TTI::SK_Reverse:
case TTI::SK_Transpose:
case TTI::SK_PermuteSingleSrc:
case TTI::SK_PermuteTwoSrc:
return getPermuteShuffleOverhead(Tp);
- default:
- return 1;
+ case TTI::SK_ExtractSubvector:
+ return getExtractSubvectorOverhead(Tp, Index, SubTp);
+ case TTI::SK_InsertSubvector:
+ return getInsertSubvectorOverhead(Tp, Index, SubTp);
}
+ llvm_unreachable("Unknown TTI::ShuffleKind");
}
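Replacing the old `default: return 1;` with explicit cases plus llvm_unreachable makes the switch exhaustive: a newly added ShuffleKind now triggers a -Wswitch warning instead of silently costing 1. A minimal sketch of the pattern, using the Clang/GCC builtin in place of llvm_unreachable:

  enum class Kind { Broadcast, Reverse, Select };
  unsigned shuffleCost(Kind K) {
    switch (K) { // no default: -Wswitch flags unhandled enumerators
    case Kind::Broadcast: return 3;
    case Kind::Reverse:   return 2;
    case Kind::Select:    return 1;
    }
    __builtin_unreachable(); // every enumerator returns above
  }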
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -783,8 +849,9 @@
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned Alignment, unsigned AddressSpace,
+ bool UseMaskForCond = false,
+ bool UseMaskForGaps = false) {
VectorType *VT = dyn_cast<VectorType>(VecTy);
assert(VT && "Expect a vector type for interleaved memory op");
@@ -795,8 +862,13 @@
VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
// First, the cost of the load/store operation.
- unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
- Opcode, VecTy, Alignment, AddressSpace);
+ unsigned Cost;
+ if (UseMaskForCond || UseMaskForGaps)
+ Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
+ Opcode, VecTy, Alignment, AddressSpace);
+ else
+ Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment,
+ AddressSpace);
// Legalize the vector type, and get the legalized and unlegalized type
// sizes.
@@ -892,6 +964,40 @@
->getVectorInstrCost(Instruction::InsertElement, VT, i);
}
+ if (!UseMaskForCond)
+ return Cost;
+
+ Type *I8Type = Type::getInt8Ty(VT->getContext());
+ VectorType *MaskVT = VectorType::get(I8Type, NumElts);
+ SubVT = VectorType::get(I8Type, NumSubElts);
+
+ // The Mask shuffling cost is to extract all the elements of the Mask
+ // and to insert each of them Factor times into the wide vector:
+ //
+ // E.g. an interleaved group with factor 3:
+ // %mask = icmp ult <8 x i32> %vec1, %vec2
+ // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
+ // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
+ // The cost is estimated as extracting all mask elements from the <8 x i1>
+ // mask vector and inserting them Factor times into the <24 x i1> shuffled
+ // mask vector.
+ for (unsigned i = 0; i < NumSubElts; i++)
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::ExtractElement, SubVT, i);
+
+ for (unsigned i = 0; i < NumElts; i++)
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::InsertElement, MaskVT, i);
+
+ // The Gaps mask is invariant and created outside the loop, therefore the
+ // cost of creating it is not accounted for here. However if we have both
+ // a MaskForGaps and some other mask that guards the execution of the
+ // memory access, we need to account for the cost of And-ing the two masks
+ // inside the loop.
+ if (UseMaskForGaps)
+ Cost += static_cast<T *>(this)->getArithmeticInstrCost(
+ BinaryOperator::And, MaskVT);
+
return Cost;
}
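For the Factor-3 example in the comment, the added mask term is 8 extracts from the <8 x i1> mask plus 24 inserts into the <24 x i1> interleaved mask, plus one And when a gaps mask is also active. A hypothetical helper showing just that arithmetic under unit per-element costs (not LLVM API):

  unsigned maskReplicationCost(unsigned Factor, unsigned NumSubElts,
                               bool UseMaskForGaps) {
    unsigned NumElts = Factor * NumSubElts; // width of the interleaved mask
    unsigned Cost = NumSubElts + NumElts;   // extracts + inserts
    if (UseMaskForGaps)
      Cost += 1;                            // And with the gaps mask in the loop
    return Cost;
  }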
@@ -901,6 +1007,7 @@
unsigned VF = 1) {
unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+ auto *ConcreteTTI = static_cast<T *>(this);
switch (IID) {
default: {
@@ -926,29 +1033,24 @@
ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
}
- return static_cast<T *>(this)->
- getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
+ return ConcreteTTI->getIntrinsicInstrCost(IID, RetTy, Types, FMF,
+ ScalarizationCost);
}
case Intrinsic::masked_scatter: {
assert(VF == 1 && "Can't vectorize types here.");
Value *Mask = Args[3];
bool VarMask = !isa<Constant>(Mask);
unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
- return
- static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
- Args[0]->getType(),
- Args[1], VarMask,
- Alignment);
+ return ConcreteTTI->getGatherScatterOpCost(
+ Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment);
}
case Intrinsic::masked_gather: {
assert(VF == 1 && "Can't vectorize types here.");
Value *Mask = Args[2];
bool VarMask = !isa<Constant>(Mask);
unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
- return
- static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
- RetTy, Args[0], VarMask,
- Alignment);
+ return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy,
+ Args[0], VarMask, Alignment);
}
case Intrinsic::experimental_vector_reduce_add:
case Intrinsic::experimental_vector_reduce_mul:
@@ -964,6 +1066,45 @@
case Intrinsic::experimental_vector_reduce_umax:
case Intrinsic::experimental_vector_reduce_umin:
return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF);
+ case Intrinsic::fshl:
+ case Intrinsic::fshr: {
+ Value *X = Args[0];
+ Value *Y = Args[1];
+ Value *Z = Args[2];
+ TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
+ TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
+ TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
+ TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
+ TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
+ OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
+ : TTI::OP_None;
+ // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+ // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ unsigned Cost = 0;
+ Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Or, RetTy);
+ Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Sub, RetTy);
+ Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Shl, RetTy,
+ OpKindX, OpKindZ, OpPropsX);
+ Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::LShr, RetTy,
+ OpKindY, OpKindZ, OpPropsY);
+ // Non-constant shift amounts require a modulo.
+ if (OpKindZ != TTI::OK_UniformConstantValue &&
+ OpKindZ != TTI::OK_NonUniformConstantValue)
+ Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
+ OpKindZ, OpKindBW, OpPropsZ,
+ OpPropsBW);
+ // For non-rotates (X != Y) we must add shift-by-zero handling costs.
+ if (X != Y) {
+ Type *CondTy = Type::getInt1Ty(RetTy->getContext());
+ if (RetVF > 1)
+ CondTy = VectorType::get(CondTy, RetVF);
+ Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy,
+ CondTy, nullptr);
+ Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
+ CondTy, nullptr);
+ }
+ return Cost;
+ }
}
}
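The operations being costed for fshl map onto a short scalar expansion. A C sketch of that expansion (a reference for the formula in the comment, not code from the patch):

  #include <cstdint>

  uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
    const uint32_t BW = 32;
    uint32_t Amt = Z % BW;                  // URem, skipped for constant Z
    if (Amt == 0)                           // icmp + select guard when X != Y;
      return X;                             // avoids the out-of-range Y >> 32
    return (X << Amt) | (Y >> (BW - Amt));  // Shl, LShr, Sub, Or
  }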
@@ -974,6 +1115,9 @@
unsigned getIntrinsicInstrCost(
Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
+ unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
+ auto *ConcreteTTI = static_cast<T *>(this);
+
SmallVector<unsigned, 2> ISDs;
unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
switch (IID) {
@@ -1002,8 +1146,8 @@
if (ScalarCalls == 1)
return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
- unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
- IID, ScalarRetTy, ScalarTys, FMF);
+ unsigned ScalarCost =
+ ConcreteTTI->getIntrinsicInstrCost(IID, ScalarRetTy, ScalarTys, FMF);
return ScalarCalls * ScalarCost + ScalarizationCost;
}
@@ -1042,12 +1186,12 @@
case Intrinsic::minnum:
ISDs.push_back(ISD::FMINNUM);
if (FMF.noNaNs())
- ISDs.push_back(ISD::FMINNAN);
+ ISDs.push_back(ISD::FMINIMUM);
break;
case Intrinsic::maxnum:
ISDs.push_back(ISD::FMAXNUM);
if (FMF.noNaNs())
- ISDs.push_back(ISD::FMAXNAN);
+ ISDs.push_back(ISD::FMAXIMUM);
break;
case Intrinsic::copysign:
ISDs.push_back(ISD::FCOPYSIGN);
@@ -1085,44 +1229,123 @@
case Intrinsic::sideeffect:
return 0;
case Intrinsic::masked_store:
- return static_cast<T *>(this)
- ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
+ return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0,
+ 0);
case Intrinsic::masked_load:
- return static_cast<T *>(this)
- ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
+ return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
case Intrinsic::experimental_vector_reduce_add:
- return static_cast<T *>(this)->getArithmeticReductionCost(
- Instruction::Add, Tys[0], /*IsPairwiseForm=*/false);
+ return ConcreteTTI->getArithmeticReductionCost(Instruction::Add, Tys[0],
+ /*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_mul:
- return static_cast<T *>(this)->getArithmeticReductionCost(
- Instruction::Mul, Tys[0], /*IsPairwiseForm=*/false);
+ return ConcreteTTI->getArithmeticReductionCost(Instruction::Mul, Tys[0],
+ /*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_and:
- return static_cast<T *>(this)->getArithmeticReductionCost(
- Instruction::And, Tys[0], /*IsPairwiseForm=*/false);
+ return ConcreteTTI->getArithmeticReductionCost(Instruction::And, Tys[0],
+ /*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_or:
- return static_cast<T *>(this)->getArithmeticReductionCost(
- Instruction::Or, Tys[0], /*IsPairwiseForm=*/false);
+ return ConcreteTTI->getArithmeticReductionCost(Instruction::Or, Tys[0],
+ /*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_xor:
- return static_cast<T *>(this)->getArithmeticReductionCost(
- Instruction::Xor, Tys[0], /*IsPairwiseForm=*/false);
+ return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0],
+ /*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_fadd:
- return static_cast<T *>(this)->getArithmeticReductionCost(
- Instruction::FAdd, Tys[0], /*IsPairwiseForm=*/false);
+ return ConcreteTTI->getArithmeticReductionCost(Instruction::FAdd, Tys[0],
+ /*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_fmul:
- return static_cast<T *>(this)->getArithmeticReductionCost(
- Instruction::FMul, Tys[0], /*IsPairwiseForm=*/false);
+ return ConcreteTTI->getArithmeticReductionCost(Instruction::FMul, Tys[0],
+ /*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_smax:
case Intrinsic::experimental_vector_reduce_smin:
case Intrinsic::experimental_vector_reduce_fmax:
case Intrinsic::experimental_vector_reduce_fmin:
- return static_cast<T *>(this)->getMinMaxReductionCost(
+ return ConcreteTTI->getMinMaxReductionCost(
Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
/*IsSigned=*/true);
case Intrinsic::experimental_vector_reduce_umax:
case Intrinsic::experimental_vector_reduce_umin:
- return static_cast<T *>(this)->getMinMaxReductionCost(
+ return ConcreteTTI->getMinMaxReductionCost(
Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
/*IsSigned=*/false);
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat: {
+ Type *CondTy = Type::getInt1Ty(RetTy->getContext());
+ if (RetVF > 1)
+ CondTy = VectorType::get(CondTy, RetVF);
+
+ Type *OpTy = StructType::create({RetTy, CondTy});
+ Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
+ ? Intrinsic::sadd_with_overflow
+ : Intrinsic::ssub_with_overflow;
+
+ // SatMax -> Overflow && SumDiff < 0
+ // SatMin -> Overflow && SumDiff >= 0
+ unsigned Cost = 0;
+ Cost += ConcreteTTI->getIntrinsicInstrCost(
+ OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed);
+ Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy,
+ CondTy, nullptr);
+ Cost += 2 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
+ CondTy, nullptr);
+ return Cost;
+ }
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat: {
+ Type *CondTy = Type::getInt1Ty(RetTy->getContext());
+ if (RetVF > 1)
+ CondTy = VectorType::get(CondTy, RetVF);
+
+ Type *OpTy = StructType::create({RetTy, CondTy});
+ Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
+ ? Intrinsic::uadd_with_overflow
+ : Intrinsic::usub_with_overflow;
+
+ unsigned Cost = 0;
+ Cost += ConcreteTTI->getIntrinsicInstrCost(
+ OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed);
+ Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
+ CondTy, nullptr);
+ return Cost;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow: {
+ Type *SumTy = RetTy->getContainedType(0);
+ Type *OverflowTy = RetTy->getContainedType(1);
+ unsigned Opcode = IID == Intrinsic::sadd_with_overflow
+ ? BinaryOperator::Add
+ : BinaryOperator::Sub;
+
+ // LHSSign -> LHS >= 0
+ // RHSSign -> RHS >= 0
+ // SumSign -> Sum >= 0
+ //
+ // Add:
+ // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+ // Sub:
+ // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+ unsigned Cost = 0;
+ Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy);
+ Cost += 3 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
+ OverflowTy, nullptr);
+ Cost += 2 * ConcreteTTI->getCmpSelInstrCost(
+ BinaryOperator::ICmp, OverflowTy, OverflowTy, nullptr);
+ Cost +=
+ ConcreteTTI->getArithmeticInstrCost(BinaryOperator::And, OverflowTy);
+ return Cost;
+ }
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::usub_with_overflow: {
+ Type *SumTy = RetTy->getContainedType(0);
+ Type *OverflowTy = RetTy->getContainedType(1);
+ unsigned Opcode = IID == Intrinsic::uadd_with_overflow
+ ? BinaryOperator::Add
+ : BinaryOperator::Sub;
+
+ unsigned Cost = 0;
+ Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy);
+ Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
+ OverflowTy, nullptr);
+ return Cost;
+ }
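The sign-based selection described for sadd.sat can be written out in scalar C. A sketch (not from the patch), with the GCC/Clang overflow builtin standing in for sadd.with.overflow:

  #include <cstdint>
  #include <limits>

  int32_t sadd_sat32(int32_t X, int32_t Y) {
    int32_t Sum;
    if (!__builtin_add_overflow(X, Y, &Sum)) // wrapped sum + overflow flag
      return Sum;
    // SatMax when the wrapped sum is negative (two positives overflowed),
    // SatMin when it is non-negative (two negatives overflowed).
    return Sum < 0 ? std::numeric_limits<int32_t>::max()
                   : std::numeric_limits<int32_t>::min();
  }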
case Intrinsic::ctpop:
ISDs.push_back(ISD::CTPOP);
// In case of legalization use TCC_Expensive. This is cheaper than a
@@ -1163,17 +1386,16 @@
if (MinLegalCostI != LegalCost.end())
return *MinLegalCostI;
- auto MinCustomCostI = std::min_element(CustomCost.begin(), CustomCost.end());
+ auto MinCustomCostI =
+ std::min_element(CustomCost.begin(), CustomCost.end());
if (MinCustomCostI != CustomCost.end())
return *MinCustomCostI;
// If we can't lower fmuladd into an FMA estimate the cost as a floating
// point mul followed by an add.
if (IID == Intrinsic::fmuladd)
- return static_cast<T *>(this)
- ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
- static_cast<T *>(this)
- ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
+ return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
+ ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it
@@ -1191,7 +1413,7 @@
Ty = Ty->getScalarType();
ScalarTys.push_back(Ty);
}
- unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
+ unsigned ScalarCost = ConcreteTTI->getIntrinsicInstrCost(
IID, RetTy->getScalarType(), ScalarTys, FMF);
for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
if (Tys[i]->isVectorTy()) {
@@ -1284,24 +1506,36 @@
LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
while (NumVecElts > MVTLen) {
NumVecElts /= 2;
+ Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
// Assume the pairwise shuffles add a cost.
ShuffleCost += (IsPairwise + 1) *
ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
- NumVecElts, Ty);
- ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
- Ty = VectorType::get(ScalarTy, NumVecElts);
+ NumVecElts, SubTy);
+ ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, SubTy);
+ Ty = SubTy;
++LongVectorCount;
}
+
+ NumReduxLevels -= LongVectorCount;
+
// The minimal length of the vector is limited by the real length of vector
// operations performed on the current platform. That's why several final
// reduction operations are performed on the vectors with the same
// architecture-dependent length.
- ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
- ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
- NumVecElts, Ty);
- ArithCost += (NumReduxLevels - LongVectorCount) *
+
+ // Non-pairwise reductions need one shuffle per reduction level. Pairwise
+ // reductions need two shuffles on every level but the last one; on that
+ // level one of the shuffles is <0, u, u, ...>, which is an identity shuffle.
+ unsigned NumShuffles = NumReduxLevels;
+ if (IsPairwise && NumReduxLevels >= 1)
+ NumShuffles += NumReduxLevels - 1;
+ ShuffleCost += NumShuffles *
+ ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
+ 0, Ty);
+ ArithCost += NumReduxLevels *
ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
- return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+ return ShuffleCost + ArithCost +
+ ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
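Putting the reduction model together: the type is split in half until it is legal, then each remaining level costs one permute shuffle (two for pairwise, except the last level, where the second shuffle is the identity) plus one arithmetic op, and a single extractelement finishes the reduction. A hypothetical step counter mirroring that control flow under unit costs:

  unsigned reductionStepCount(unsigned NumVecElts, unsigned MVTLen,
                              bool IsPairwise) {
    unsigned NumReduxLevels = 0;
    for (unsigned N = NumVecElts; N > 1; N /= 2)
      ++NumReduxLevels;                    // log2(NumVecElts) levels in total

    unsigned Steps = 0, LongVectorCount = 0;
    while (NumVecElts > MVTLen) {          // split until the type is legal
      NumVecElts /= 2;
      Steps += (IsPairwise ? 2 : 1) + 1;   // extract-subvector shuffles + arith
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;
    unsigned NumShuffles = NumReduxLevels;
    if (IsPairwise && NumReduxLevels >= 1)
      NumShuffles += NumReduxLevels - 1;   // identity shuffle saved on last level
    Steps += NumShuffles + NumReduxLevels; // shuffles + arith per level
    return Steps + 1;                      // final extractelement
  }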
/// Try to calculate op costs for min/max reduction operations.
@@ -1331,37 +1565,46 @@
LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
while (NumVecElts > MVTLen) {
NumVecElts /= 2;
+ Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
+ CondTy = VectorType::get(ScalarCondTy, NumVecElts);
+
// Assume the pairwise shuffles add a cost.
ShuffleCost += (IsPairwise + 1) *
ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
- NumVecElts, Ty);
+ NumVecElts, SubTy);
MinMaxCost +=
- ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
- ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+ ConcreteTTI->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, nullptr) +
+ ConcreteTTI->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
nullptr);
- Ty = VectorType::get(ScalarTy, NumVecElts);
- CondTy = VectorType::get(ScalarCondTy, NumVecElts);
+ Ty = SubTy;
++LongVectorCount;
}
+
+ NumReduxLevels -= LongVectorCount;
+
// The minimal length of the vector is limited by the real length of vector
// operations performed on the current platform. That's why several final
// reduction operations are performed on the vectors with the same
// architecture-dependent length.
- ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
- ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
- NumVecElts, Ty);
+
+ // Non-pairwise reductions need one shuffle per reduction level. Pairwise
+ // reductions need two shuffles on every level but the last one; on that
+ // level one of the shuffles is <0, u, u, ...>, which is an identity shuffle.
+ unsigned NumShuffles = NumReduxLevels;
+ if (IsPairwise && NumReduxLevels >= 1)
+ NumShuffles += NumReduxLevels - 1;
+ ShuffleCost += NumShuffles *
+ ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
+ 0, Ty);
MinMaxCost +=
- (NumReduxLevels - LongVectorCount) *
+ NumReduxLevels *
(ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
nullptr));
- // Need 3 extractelement instructions for scalarization + an additional
- // scalar select instruction.
+ // The last min/max is still in a vector register and was counted above,
+ // so we only need a single extractelement here.
return ShuffleCost + MinMaxCost +
- 3 * getScalarizationOverhead(Ty, /*Insert=*/false,
- /*Extract=*/true) +
- ConcreteTTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- ScalarCondTy, nullptr);
+ ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
unsigned getVectorSplitCost() { return 1; }