Update prebuilt Clang to match Android kernel.

Bug: 132428451
Change-Id: I8f6e2cb23f381fc0c02ddea99b867e58e925e5be
diff --git a/linux-x64/clang/include/llvm/CodeGen/BasicTTIImpl.h b/linux-x64/clang/include/llvm/CodeGen/BasicTTIImpl.h
index b460cdc..1e9aeab 100644
--- a/linux-x64/clang/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/linux-x64/clang/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1,9 +1,8 @@
 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -80,6 +79,23 @@
   using BaseT = TargetTransformInfoImplCRTPBase<T>;
   using TTI = TargetTransformInfo;
 
+  /// Estimate the cost of a broadcast as an extract and a sequence of insert
+  /// operations.
+  unsigned getBroadcastShuffleOverhead(Type *Ty) {
+    assert(Ty->isVectorTy() && "Can only shuffle vectors");
+    unsigned Cost = 0;
+    // Broadcast cost is equal to the cost of extracting the zeroth element
+    // plus the cost of inserting it into every element of the result vector.
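+    // E.g., broadcasting a <4 x i32> is modeled as one extractelement plus
+    // four insertelements, each priced by the target's getVectorInstrCost.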
+    Cost += static_cast<T *>(this)->getVectorInstrCost(
+        Instruction::ExtractElement, Ty, 0);
+
+    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, Ty, i);
+    }
+    return Cost;
+  }
+
   /// Estimate the cost of a shuffle as a sequence of extract and insert
   /// operations.
   unsigned getPermuteShuffleOverhead(Type *Ty) {
@@ -101,6 +117,50 @@
     return Cost;
   }
 
+  /// Estimate the cost of a subvector extraction as a sequence of extract and
+  /// insert operations.
+  unsigned getExtractSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
+    assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
+           "Can only extract subvectors from vectors");
+    int NumSubElts = SubTy->getVectorNumElements();
+    assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
+           "SK_ExtractSubvector index out of range");
+
+    unsigned Cost = 0;
+    // Subvector extraction cost is equal to the cost of extracting each
+    // element from the source vector plus the cost of inserting it into the
+    // result subvector.
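+    // E.g., extracting a <2 x float> at Index 4 from an <8 x float> is two
+    // extractelements (indices 4 and 5) plus two insertelements (0 and 1).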
+    for (int i = 0; i != NumSubElts; ++i) {
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::ExtractElement, Ty, i + Index);
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, SubTy, i);
+    }
+    return Cost;
+  }
+
+  /// Estimate the cost of a subvector insertion as a sequence of extract and
+  /// insert operations.
+  unsigned getInsertSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
+    assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
+           "Can only insert subvectors into vectors");
+    int NumSubElts = SubTy->getVectorNumElements();
+    assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
+           "SK_InsertSubvector index out of range");
+
+    unsigned Cost = 0;
+    // Subvector insertion cost is equal to the cost of extracting each
+    // element from the subvector plus the cost of inserting it into the
+    // destination vector.
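+    // E.g., inserting a <2 x float> at Index 4 into an <8 x float> is two
+    // extractelements (0 and 1) plus two insertelements (indices 4 and 5).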
+    for (int i = 0; i != NumSubElts; ++i) {
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::ExtractElement, SubTy, i);
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, Ty, i + Index);
+    }
+    return Cost;
+  }
+
   /// Local query method delegates up to T which *must* implement this!
   const TargetSubtargetInfo *getST() const {
     return static_cast<const T *>(this)->getST();
@@ -554,14 +614,20 @@
   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                           Type *SubTp) {
     switch (Kind) {
+    case TTI::SK_Broadcast:
+      return getBroadcastShuffleOverhead(Tp);
     case TTI::SK_Select:
+    case TTI::SK_Reverse:
     case TTI::SK_Transpose:
     case TTI::SK_PermuteSingleSrc:
     case TTI::SK_PermuteTwoSrc:
       return getPermuteShuffleOverhead(Tp);
-    default:
-      return 1;
+    case TTI::SK_ExtractSubvector:
+      return getExtractSubvectorOverhead(Tp, Index, SubTp);
+    case TTI::SK_InsertSubvector:
+      return getInsertSubvectorOverhead(Tp, Index, SubTp);
     }
+    llvm_unreachable("Unknown TTI::ShuffleKind");
   }
 
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -783,8 +849,9 @@
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
-                                      unsigned Alignment,
-                                      unsigned AddressSpace) {
+                                      unsigned Alignment, unsigned AddressSpace,
+                                      bool UseMaskForCond = false,
+                                      bool UseMaskForGaps = false) {
     VectorType *VT = dyn_cast<VectorType>(VecTy);
     assert(VT && "Expect a vector type for interleaved memory op");
 
@@ -795,8 +862,13 @@
     VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
 
     // Firstly, the cost of load/store operation.
-    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
-        Opcode, VecTy, Alignment, AddressSpace);
+    unsigned Cost;
+    if (UseMaskForCond || UseMaskForGaps)
+      Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
+          Opcode, VecTy, Alignment, AddressSpace);
+    else
+      Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment,
+                                                     AddressSpace);
 
     // Legalize the vector type, and get the legalized and unlegalized type
     // sizes.
@@ -892,6 +964,40 @@
                     ->getVectorInstrCost(Instruction::InsertElement, VT, i);
     }
 
+    if (!UseMaskForCond)
+      return Cost;
+
+    Type *I8Type = Type::getInt8Ty(VT->getContext());
+    VectorType *MaskVT = VectorType::get(I8Type, NumElts);
+    SubVT = VectorType::get(I8Type, NumSubElts);
+
+    // The mask shuffling cost is the cost of extracting all the elements of
+    // the mask and inserting each of them Factor times into the wide vector:
+    //
+    // E.g. an interleaved group with factor 3:
+    //    %mask = icmp ult <8 x i32> %vec1, %vec2
+    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
+    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
+    // The cost is estimated as extracting all mask elements from the <8 x i1>
+    // mask vector and inserting them Factor times into the <24 x i1> shuffled
+    // mask vector.
+    for (unsigned i = 0; i < NumSubElts; i++)
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::ExtractElement, SubVT, i);
+
+    for (unsigned i = 0; i < NumElts; i++)
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, MaskVT, i);
+
+    // The Gaps mask is invariant and created outside the loop, therefore the
+    // cost of creating it is not accounted for here. However, if we have both
+    // a MaskForGaps and some other mask that guards the execution of the
+    // memory access, we need to account for the cost of And-ing the two masks
+    // inside the loop.
+    if (UseMaskForGaps)
+      Cost += static_cast<T *>(this)->getArithmeticInstrCost(
+          BinaryOperator::And, MaskVT);
+
     return Cost;
   }
 
@@ -901,6 +1007,7 @@
                                  unsigned VF = 1) {
     unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
     assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+    auto *ConcreteTTI = static_cast<T *>(this);
 
     switch (IID) {
     default: {
@@ -926,29 +1033,24 @@
         ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
       }
 
-      return static_cast<T *>(this)->
-        getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
+      return ConcreteTTI->getIntrinsicInstrCost(IID, RetTy, Types, FMF,
+                                                ScalarizationCost);
     }
     case Intrinsic::masked_scatter: {
       assert(VF == 1 && "Can't vectorize types here.");
       Value *Mask = Args[3];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
-      return
-        static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
-                                                       Args[0]->getType(),
-                                                       Args[1], VarMask,
-                                                       Alignment);
+      return ConcreteTTI->getGatherScatterOpCost(
+          Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment);
     }
     case Intrinsic::masked_gather: {
       assert(VF == 1 && "Can't vectorize types here.");
       Value *Mask = Args[2];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
-      return
-        static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
-                                                       RetTy, Args[0], VarMask,
-                                                       Alignment);
+      return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy,
+                                                 Args[0], VarMask, Alignment);
     }
     case Intrinsic::experimental_vector_reduce_add:
     case Intrinsic::experimental_vector_reduce_mul:
@@ -964,6 +1066,45 @@
     case Intrinsic::experimental_vector_reduce_umax:
     case Intrinsic::experimental_vector_reduce_umin:
       return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF);
+    case Intrinsic::fshl:
+    case Intrinsic::fshr: {
+      Value *X = Args[0];
+      Value *Y = Args[1];
+      Value *Z = Args[2];
+      TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
+      TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
+      TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
+      TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
+      TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
+      OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
+                                                              : TTI::OP_None;
+      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
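+      // E.g., for i32 (BW = 32), fshl(X, Y, 8) == (X << 8) | (Y >> 24), so
+      // the base expansion below prices an or, a sub, and the two shifts.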
+      unsigned Cost = 0;
+      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Or, RetTy);
+      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Sub, RetTy);
+      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Shl, RetTy,
+                                                  OpKindX, OpKindZ, OpPropsX);
+      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::LShr, RetTy,
+                                                  OpKindY, OpKindZ, OpPropsY);
+      // Non-constant shift amounts require a modulo.
+      if (OpKindZ != TTI::OK_UniformConstantValue &&
+          OpKindZ != TTI::OK_NonUniformConstantValue)
+        Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
+                                                    OpKindZ, OpKindBW, OpPropsZ,
+                                                    OpPropsBW);
+      // For non-rotates (X != Y) we must add shift-by-zero handling costs.
+      if (X != Y) {
+        Type *CondTy = Type::getInt1Ty(RetTy->getContext());
+        if (RetVF > 1)
+          CondTy = VectorType::get(CondTy, RetVF);
+        Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy,
+                                                CondTy, nullptr);
+        Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
+                                                CondTy, nullptr);
+      }
+      return Cost;
+    }
     }
   }
 
@@ -974,6 +1115,9 @@
   unsigned getIntrinsicInstrCost(
       Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
       unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
+    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
+    auto *ConcreteTTI = static_cast<T *>(this);
+
     SmallVector<unsigned, 2> ISDs;
     unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
     switch (IID) {
@@ -1002,8 +1146,8 @@
       if (ScalarCalls == 1)
         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
 
-      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
-          IID, ScalarRetTy, ScalarTys, FMF);
+      unsigned ScalarCost =
+          ConcreteTTI->getIntrinsicInstrCost(IID, ScalarRetTy, ScalarTys, FMF);
 
       return ScalarCalls * ScalarCost + ScalarizationCost;
     }
@@ -1042,12 +1186,12 @@
     case Intrinsic::minnum:
       ISDs.push_back(ISD::FMINNUM);
       if (FMF.noNaNs())
-        ISDs.push_back(ISD::FMINNAN);
+        ISDs.push_back(ISD::FMINIMUM);
       break;
     case Intrinsic::maxnum:
       ISDs.push_back(ISD::FMAXNUM);
       if (FMF.noNaNs())
-        ISDs.push_back(ISD::FMAXNAN);
+        ISDs.push_back(ISD::FMAXIMUM);
       break;
     case Intrinsic::copysign:
       ISDs.push_back(ISD::FCOPYSIGN);
@@ -1085,44 +1229,123 @@
     case Intrinsic::sideeffect:
       return 0;
     case Intrinsic::masked_store:
-      return static_cast<T *>(this)
-          ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
+      return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0,
+                                                0);
     case Intrinsic::masked_load:
-      return static_cast<T *>(this)
-          ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
+      return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
     case Intrinsic::experimental_vector_reduce_add:
-      return static_cast<T *>(this)->getArithmeticReductionCost(
-          Instruction::Add, Tys[0], /*IsPairwiseForm=*/false);
+      return ConcreteTTI->getArithmeticReductionCost(Instruction::Add, Tys[0],
+                                                     /*IsPairwiseForm=*/false);
     case Intrinsic::experimental_vector_reduce_mul:
-      return static_cast<T *>(this)->getArithmeticReductionCost(
-          Instruction::Mul, Tys[0], /*IsPairwiseForm=*/false);
+      return ConcreteTTI->getArithmeticReductionCost(Instruction::Mul, Tys[0],
+                                                     /*IsPairwiseForm=*/false);
     case Intrinsic::experimental_vector_reduce_and:
-      return static_cast<T *>(this)->getArithmeticReductionCost(
-          Instruction::And, Tys[0], /*IsPairwiseForm=*/false);
+      return ConcreteTTI->getArithmeticReductionCost(Instruction::And, Tys[0],
+                                                     /*IsPairwiseForm=*/false);
     case Intrinsic::experimental_vector_reduce_or:
-      return static_cast<T *>(this)->getArithmeticReductionCost(
-          Instruction::Or, Tys[0], /*IsPairwiseForm=*/false);
+      return ConcreteTTI->getArithmeticReductionCost(Instruction::Or, Tys[0],
+                                                     /*IsPairwiseForm=*/false);
     case Intrinsic::experimental_vector_reduce_xor:
-      return static_cast<T *>(this)->getArithmeticReductionCost(
-          Instruction::Xor, Tys[0], /*IsPairwiseForm=*/false);
+      return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0],
+                                                     /*IsPairwiseForm=*/false);
     case Intrinsic::experimental_vector_reduce_fadd:
-      return static_cast<T *>(this)->getArithmeticReductionCost(
-          Instruction::FAdd, Tys[0], /*IsPairwiseForm=*/false);
+      return ConcreteTTI->getArithmeticReductionCost(Instruction::FAdd, Tys[0],
+                                                     /*IsPairwiseForm=*/false);
     case Intrinsic::experimental_vector_reduce_fmul:
-      return static_cast<T *>(this)->getArithmeticReductionCost(
-          Instruction::FMul, Tys[0], /*IsPairwiseForm=*/false);
+      return ConcreteTTI->getArithmeticReductionCost(Instruction::FMul, Tys[0],
+                                                     /*IsPairwiseForm=*/false);
     case Intrinsic::experimental_vector_reduce_smax:
     case Intrinsic::experimental_vector_reduce_smin:
     case Intrinsic::experimental_vector_reduce_fmax:
     case Intrinsic::experimental_vector_reduce_fmin:
-      return static_cast<T *>(this)->getMinMaxReductionCost(
+      return ConcreteTTI->getMinMaxReductionCost(
           Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
           /*IsSigned=*/true);
     case Intrinsic::experimental_vector_reduce_umax:
     case Intrinsic::experimental_vector_reduce_umin:
-      return static_cast<T *>(this)->getMinMaxReductionCost(
+      return ConcreteTTI->getMinMaxReductionCost(
           Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
           /*IsSigned=*/false);
+    case Intrinsic::sadd_sat:
+    case Intrinsic::ssub_sat: {
+      Type *CondTy = Type::getInt1Ty(RetTy->getContext());
+      if (RetVF > 1)
+        CondTy = VectorType::get(CondTy, RetVF);
+
+      Type *OpTy = StructType::create({RetTy, CondTy});
+      Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
+                                     ? Intrinsic::sadd_with_overflow
+                                     : Intrinsic::ssub_with_overflow;
+
+      // SatMax -> Overflow && SumDiff < 0
+      // SatMin -> Overflow && SumDiff >= 0
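+      // I.e., the result saturates to SMAX when a negative wrapped sum
+      // overflowed and to SMIN when a non-negative one did, hence the icmp
+      // plus two selects priced below.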
+      unsigned Cost = 0;
+      Cost += ConcreteTTI->getIntrinsicInstrCost(
+          OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed);
+      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy,
+                                              CondTy, nullptr);
+      Cost += 2 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
+                                                  CondTy, nullptr);
+      return Cost;
+    }
+    case Intrinsic::uadd_sat:
+    case Intrinsic::usub_sat: {
+      Type *CondTy = Type::getInt1Ty(RetTy->getContext());
+      if (RetVF > 1)
+        CondTy = VectorType::get(CondTy, RetVF);
+
+      Type *OpTy = StructType::create({RetTy, CondTy});
+      Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
+                                     ? Intrinsic::uadd_with_overflow
+                                     : Intrinsic::usub_with_overflow;
+
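+      // Unsigned saturation clamps to all-ones (uadd) or zero (usub) on
+      // overflow, so a single select on the overflow flag suffices.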
+      unsigned Cost = 0;
+      Cost += ConcreteTTI->getIntrinsicInstrCost(
+          OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed);
+      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
+                                              CondTy, nullptr);
+      return Cost;
+    }
+    case Intrinsic::sadd_with_overflow:
+    case Intrinsic::ssub_with_overflow: {
+      Type *SumTy = RetTy->getContainedType(0);
+      Type *OverflowTy = RetTy->getContainedType(1);
+      unsigned Opcode = IID == Intrinsic::sadd_with_overflow
+                            ? BinaryOperator::Add
+                            : BinaryOperator::Sub;
+
+      //   LHSSign -> LHS >= 0
+      //   RHSSign -> RHS >= 0
+      //   SumSign -> Sum >= 0
+      //
+      //   Add:
+      //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+      //   Sub:
+      //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
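+      //
+      //   E.g. for i8, 100 + 100 wraps to -56: both operands are
+      //   non-negative but the sum is negative, so the Add test above fires.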
+      unsigned Cost = 0;
+      Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy);
+      Cost += 3 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
+                                                  OverflowTy, nullptr);
+      Cost += 2 * ConcreteTTI->getCmpSelInstrCost(
+                      BinaryOperator::ICmp, OverflowTy, OverflowTy, nullptr);
+      Cost +=
+          ConcreteTTI->getArithmeticInstrCost(BinaryOperator::And, OverflowTy);
+      return Cost;
+    }
+    case Intrinsic::uadd_with_overflow:
+    case Intrinsic::usub_with_overflow: {
+      Type *SumTy = RetTy->getContainedType(0);
+      Type *OverflowTy = RetTy->getContainedType(1);
+      unsigned Opcode = IID == Intrinsic::uadd_with_overflow
+                            ? BinaryOperator::Add
+                            : BinaryOperator::Sub;
+
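+      // A single compare detects unsigned wrap: uadd overflows iff the sum is
+      // less than either operand, usub iff LHS < RHS.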
+      unsigned Cost = 0;
+      Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy);
+      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
+                                              OverflowTy, nullptr);
+      return Cost;
+    }
     case Intrinsic::ctpop:
       ISDs.push_back(ISD::CTPOP);
       // In case of legalization use TCC_Expensive. This is cheaper than a
@@ -1163,17 +1386,16 @@
     if (MinLegalCostI != LegalCost.end())
       return *MinLegalCostI;
 
-    auto MinCustomCostI = std::min_element(CustomCost.begin(), CustomCost.end());
+    auto MinCustomCostI =
+        std::min_element(CustomCost.begin(), CustomCost.end());
     if (MinCustomCostI != CustomCost.end())
       return *MinCustomCostI;
 
     // If we can't lower fmuladd into an FMA estimate the cost as a floating
     // point mul followed by an add.
     if (IID == Intrinsic::fmuladd)
-      return static_cast<T *>(this)
-                 ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
-             static_cast<T *>(this)
-                 ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
+      return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
+             ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
 
     // Else, assume that we need to scalarize this intrinsic. For math builtins
     // this will emit a costly libcall, adding call overhead and spills. Make it
@@ -1191,7 +1413,7 @@
           Ty = Ty->getScalarType();
         ScalarTys.push_back(Ty);
       }
-      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
+      unsigned ScalarCost = ConcreteTTI->getIntrinsicInstrCost(
           IID, RetTy->getScalarType(), ScalarTys, FMF);
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
         if (Tys[i]->isVectorTy()) {
@@ -1284,24 +1506,36 @@
         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
+      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
       // Assume the pairwise shuffles add a cost.
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                                 NumVecElts, Ty);
-      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
-      Ty = VectorType::get(ScalarTy, NumVecElts);
+                                                 NumVecElts, SubTy);
+      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, SubTy);
+      Ty = SubTy;
       ++LongVectorCount;
     }
+
+    NumReduxLevels -= LongVectorCount;
+
     // The minimal length of the vector is limited by the real length of vector
     // operations performed on the current platform. That's why several final
     // reduction operations are performed on the vectors with the same
     // architecture-dependent length.
-    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
-                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                               NumVecElts, Ty);
-    ArithCost += (NumReduxLevels - LongVectorCount) *
+
+    // Non-pairwise reductions need one shuffle per reduction level. Pairwise
+    // reductions need two shuffles on every level except the last one, where
+    // one of the shuffles is <0, u, u, ...>, which is an identity shuffle.
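+    // E.g. three remaining levels cost three shuffles when non-pairwise and
+    // five (2 + 2 + 1) when pairwise.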
+    unsigned NumShuffles = NumReduxLevels;
+    if (IsPairwise && NumReduxLevels >= 1)
+      NumShuffles += NumReduxLevels - 1;
+    ShuffleCost += NumShuffles *
+                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
+                                               0, Ty);
+    ArithCost += NumReduxLevels *
                  ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
-    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+    return ShuffleCost + ArithCost +
+           ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
   /// Try to calculate op costs for min/max reduction operations.
@@ -1331,37 +1565,46 @@
         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
+      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
+      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
+
       // Assume the pairwise shuffles add a cost.
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                                 NumVecElts, Ty);
+                                                 NumVecElts, SubTy);
       MinMaxCost +=
-          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
-          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, nullptr) +
+          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
                                           nullptr);
-      Ty = VectorType::get(ScalarTy, NumVecElts);
-      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
+      Ty = SubTy;
       ++LongVectorCount;
     }
+
+    NumReduxLevels -= LongVectorCount;
+
     // The minimal length of the vector is limited by the real length of vector
     // operations performed on the current platform. That's why several final
     // reduction operations are performed on the vectors with the same
     // architecture-dependent length.
-    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
-                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                               NumVecElts, Ty);
+
+    // Non-pairwise reductions need one shuffle per reduction level. Pairwise
+    // reductions need two shuffles on every level except the last one, where
+    // one of the shuffles is <0, u, u, ...>, which is an identity shuffle.
+    unsigned NumShuffles = NumReduxLevels;
+    if (IsPairwise && NumReduxLevels >= 1)
+      NumShuffles += NumReduxLevels - 1;
+    ShuffleCost += NumShuffles *
+                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
+                                               0, Ty);
     MinMaxCost +=
-        (NumReduxLevels - LongVectorCount) *
+        NumReduxLevels *
         (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                          nullptr));
-    // Need 3 extractelement instructions for scalarization + an additional
-    // scalar select instruction.
+    // The last min/max should be in vector registers and we counted it above,
+    // so we just need a single extractelement.
     return ShuffleCost + MinMaxCost +
-           3 * getScalarizationOverhead(Ty, /*Insert=*/false,
-                                        /*Extract=*/true) +
-           ConcreteTTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           ScalarCondTy, nullptr);
+           ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
   unsigned getVectorSplitCost() { return 1; }