From owner-svn-src-head@FreeBSD.ORG Mon Mar 23 21:15:08 2015 Return-Path: Delivered-To: svn-src-head@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id BD0CD1E2; Mon, 23 Mar 2015 21:15:08 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id A78C0E77; Mon, 23 Mar 2015 21:15:08 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.9/8.14.9) with ESMTP id t2NLF8B3007369; Mon, 23 Mar 2015 21:15:08 GMT (envelope-from dim@FreeBSD.org) Received: (from dim@localhost) by svn.freebsd.org (8.14.9/8.14.9/Submit) id t2NLF8eC007368; Mon, 23 Mar 2015 21:15:08 GMT (envelope-from dim@FreeBSD.org) Message-Id: <201503232115.t2NLF8eC007368@svn.freebsd.org> X-Authentication-Warning: svn.freebsd.org: dim set sender to dim@FreeBSD.org using -f From: Dimitry Andric Date: Mon, 23 Mar 2015 21:15:08 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r280401 - head/contrib/llvm/patches X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.18-1 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 23 Mar 2015 21:15:08 -0000 Author: dim Date: Mon Mar 23 21:15:07 2015 New Revision: 280401 URL: https://svnweb.freebsd.org/changeset/base/280401 Log: Add llvm patch corresponding to r280400. 
Added: head/contrib/llvm/patches/patch-10-llvm-r230348-arm-fix-bad-ha.diff Added: head/contrib/llvm/patches/patch-10-llvm-r230348-arm-fix-bad-ha.diff ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/contrib/llvm/patches/patch-10-llvm-r230348-arm-fix-bad-ha.diff Mon Mar 23 21:15:07 2015 (r280401) @@ -0,0 +1,419 @@ +Pull in r230348 from upstream llvm trunk (by Tim Northover): + + ARM: treat [N x i32] and [N x i64] as AAPCS composite types + + The logic is almost there already, with our special homogeneous + aggregate handling. Tweaking it like this allows front-ends to emit + AAPCS compliant code without ever having to count registers or add + discarded padding arguments. + + Only arrays of i32 and i64 are needed to model AAPCS rules, but I + decided to apply the logic to all integer arrays for more consistency. + +This fixes a possible "Unexpected member type for HA" error when +compiling lib/msun/bsdsrc/b_tgamma.c for armv6. + +Reported by: Jakub Palider + +Introduced here: https://svnweb.freebsd.org/changeset/base/280400 + +Index: include/llvm/CodeGen/CallingConvLower.h +=================================================================== +--- include/llvm/CodeGen/CallingConvLower.h ++++ include/llvm/CodeGen/CallingConvLower.h +@@ -122,8 +122,8 @@ class CCValAssign { + // There is no need to differentiate between a pending CCValAssign and other + // kinds, as they are stored in a different list. 
+ static CCValAssign getPending(unsigned ValNo, MVT ValVT, MVT LocVT, +- LocInfo HTP) { +- return getReg(ValNo, ValVT, 0, LocVT, HTP); ++ LocInfo HTP, unsigned ExtraInfo = 0) { ++ return getReg(ValNo, ValVT, ExtraInfo, LocVT, HTP); + } + + void convertToReg(unsigned RegNo) { +@@ -146,6 +146,7 @@ class CCValAssign { + + unsigned getLocReg() const { assert(isRegLoc()); return Loc; } + unsigned getLocMemOffset() const { assert(isMemLoc()); return Loc; } ++ unsigned getExtraInfo() const { return Loc; } + MVT getLocVT() const { return LocVT; } + + LocInfo getLocInfo() const { return HTP; } +Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +=================================================================== +--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp ++++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +@@ -7429,11 +7429,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLo + } + if (Args[i].isNest) + Flags.setNest(); +- if (NeedsRegBlock) { ++ if (NeedsRegBlock) + Flags.setInConsecutiveRegs(); +- if (Value == NumValues - 1) +- Flags.setInConsecutiveRegsLast(); +- } + Flags.setOrigAlign(OriginalAlignment); + + MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT); +@@ -7482,6 +7479,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLo + CLI.Outs.push_back(MyFlags); + CLI.OutVals.push_back(Parts[j]); + } ++ ++ if (NeedsRegBlock && Value == NumValues - 1) ++ CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast(); + } + } + +@@ -7696,11 +7696,8 @@ void SelectionDAGISel::LowerArguments(const Functi + } + if (F.getAttributes().hasAttribute(Idx, Attribute::Nest)) + Flags.setNest(); +- if (NeedsRegBlock) { ++ if (NeedsRegBlock) + Flags.setInConsecutiveRegs(); +- if (Value == NumValues - 1) +- Flags.setInConsecutiveRegsLast(); +- } + Flags.setOrigAlign(OriginalAlignment); + + MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT); +@@ -7715,6 +7712,8 @@ void SelectionDAGISel::LowerArguments(const Functi + MyFlags.Flags.setOrigAlign(1); 
+ Ins.push_back(MyFlags); + } ++ if (NeedsRegBlock && Value == NumValues - 1) ++ Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast(); + PartBase += VT.getStoreSize(); + } + } +Index: lib/Target/ARM/ARMCallingConv.h +=================================================================== +--- lib/Target/ARM/ARMCallingConv.h ++++ lib/Target/ARM/ARMCallingConv.h +@@ -160,6 +160,8 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &V + State); + } + ++static const uint16_t RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; ++ + static const uint16_t SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, +@@ -168,81 +170,114 @@ static const uint16_t DRegList[] = { ARM::D0, ARM: + ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; + static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; + ++ + // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA + // has InConsecutiveRegs set, and that the last member also has + // InConsecutiveRegsLast set. We must process all members of the HA before + // we can allocate it, as we need to know the total number of registers that + // will be needed in order to (attempt to) allocate a contiguous block. 
+-static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT, +- CCValAssign::LocInfo &LocInfo, +- ISD::ArgFlagsTy &ArgFlags, CCState &State) { +- SmallVectorImpl<CCValAssign> &PendingHAMembers = State.getPendingLocs(); ++static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, ++ MVT &LocVT, ++ CCValAssign::LocInfo &LocInfo, ++ ISD::ArgFlagsTy &ArgFlags, ++ CCState &State) { ++ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // AAPCS HFAs must have 1-4 elements, all of the same type +- assert(PendingHAMembers.size() < 4); +- if (PendingHAMembers.size() > 0) +- assert(PendingHAMembers[0].getLocVT() == LocVT); ++ if (PendingMembers.size() > 0) ++ assert(PendingMembers[0].getLocVT() == LocVT); + + // Add the argument to the list to be allocated once we know the size of the +- // HA +- PendingHAMembers.push_back( +- CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); ++ // aggregate. Store the type's required alignment as extra info for later: in ++ // the [N x i64] case all trace has been removed by the time we actually get ++ // to do allocation. ++ PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo, ++ ArgFlags.getOrigAlign())); + +- if (ArgFlags.isInConsecutiveRegsLast()) { +- assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 4 && +- "Homogeneous aggregates must have between 1 and 4 members"); ++ if (!ArgFlags.isInConsecutiveRegsLast()) ++ return true; + +- // Try to allocate a contiguous block of registers, each of the correct +- // size to hold one member. +- ArrayRef<uint16_t> RegList; +- switch (LocVT.SimpleTy) { +- case MVT::f32: +- RegList = SRegList; +- break; +- case MVT::f64: +- RegList = DRegList; +- break; +- case MVT::v2f64: +- RegList = QRegList; +- break; +- default: +- llvm_unreachable("Unexpected member type for HA"); +- break; +- } ++ // Try to allocate a contiguous block of registers, each of the correct ++ // size to hold one member. 
++ unsigned Align = std::min(PendingMembers[0].getExtraInfo(), 8U); + +- unsigned RegResult = +- State.AllocateRegBlock(RegList, PendingHAMembers.size()); ++ ArrayRef<uint16_t> RegList; ++ switch (LocVT.SimpleTy) { ++ case MVT::i32: { ++ RegList = RRegList; ++ unsigned RegIdx = State.getFirstUnallocated(RegList.data(), RegList.size()); + +- if (RegResult) { +- for (SmallVectorImpl<CCValAssign>::iterator It = PendingHAMembers.begin(); +- It != PendingHAMembers.end(); ++It) { +- It->convertToReg(RegResult); +- State.addLoc(*It); +- ++RegResult; +- } +- PendingHAMembers.clear(); +- return true; +- } ++ // First consume all registers that would give an unaligned object. Whether ++ // we go on stack or in regs, no-one will be using them in future. ++ unsigned RegAlign = RoundUpToAlignment(Align, 4) / 4; ++ while (RegIdx % RegAlign != 0 && RegIdx < RegList.size()) ++ State.AllocateReg(RegList[RegIdx++]); + +- // Register allocation failed, fall back to the stack ++ break; ++ } ++ case MVT::f32: ++ RegList = SRegList; ++ break; ++ case MVT::f64: ++ RegList = DRegList; ++ break; ++ case MVT::v2f64: ++ RegList = QRegList; ++ break; ++ default: ++ llvm_unreachable("Unexpected member type for block aggregate"); ++ break; ++ } + +- // Mark all VFP regs as unavailable (AAPCS rule C.2.vfp) +- for (unsigned regNo = 0; regNo < 16; ++regNo) +- State.AllocateReg(SRegList[regNo]); ++ unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); ++ if (RegResult) { ++ for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin(); ++ It != PendingMembers.end(); ++It) { ++ It->convertToReg(RegResult); ++ State.addLoc(*It); ++ ++RegResult; ++ } ++ PendingMembers.clear(); ++ return true; ++ } + +- unsigned Size = LocVT.getSizeInBits() / 8; +- unsigned Align = std::min(Size, 8U); ++ // Register allocation failed, we'll be needing the stack ++ unsigned Size = LocVT.getSizeInBits() / 8; ++ if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) { ++ // If nothing else has used the stack until this 
point, a non-HFA aggregate ++ // can be split between regs and stack. ++ unsigned RegIdx = State.getFirstUnallocated(RegList.data(), RegList.size()); ++ for (auto &It : PendingMembers) { ++ if (RegIdx >= RegList.size()) ++ It.convertToMem(State.AllocateStack(Size, Size)); ++ else ++ It.convertToReg(State.AllocateReg(RegList[RegIdx++])); + +- for (auto It : PendingHAMembers) { +- It.convertToMem(State.AllocateStack(Size, Align)); + State.addLoc(It); + } ++ PendingMembers.clear(); ++ return true; ++ } else if (LocVT != MVT::i32) ++ RegList = SRegList; + +- // All pending members have now been allocated +- PendingHAMembers.clear(); ++ // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core) ++ for (auto Reg : RegList) ++ State.AllocateReg(Reg); ++ ++ for (auto &It : PendingMembers) { ++ It.convertToMem(State.AllocateStack(Size, Align)); ++ State.addLoc(It); ++ ++ // After the first item has been allocated, the rest are packed as tightly ++ // as possible. (E.g. an incoming i64 would have starting Align of 8, but ++ // we'll be allocating a bunch of i32 slots). 
++ Align = Size; + } + +- // This will be allocated by the last member of the HA ++ // All pending members have now been allocated ++ PendingMembers.clear(); ++ ++ // This will be allocated by the last member of the aggregate + return true; + } + +Index: lib/Target/ARM/ARMCallingConv.td +=================================================================== +--- lib/Target/ARM/ARMCallingConv.td ++++ lib/Target/ARM/ARMCallingConv.td +@@ -175,7 +175,7 @@ def CC_ARM_AAPCS_VFP : CallingConv<[ + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + // HFAs are passed in a contiguous block of registers, or on the stack +- CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_HA">>, ++ CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, +Index: lib/Target/ARM/ARMISelLowering.cpp +=================================================================== +--- lib/Target/ARM/ARMISelLowering.cpp ++++ lib/Target/ARM/ARMISelLowering.cpp +@@ -11280,7 +11280,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABas + return (Members > 0 && Members <= 4); + } + +-/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate. ++/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of ++/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when ++/// passing according to AAPCS rules. 
+ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { + if (getEffectiveCallingConv(CallConv, isVarArg) != +@@ -11289,7 +11291,9 @@ bool ARMTargetLowering::functionArgumentNeedsConse + + HABaseType Base = HA_UNKNOWN; + uint64_t Members = 0; +- bool result = isHomogeneousAggregate(Ty, Base, Members); +- DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump()); +- return result; ++ bool IsHA = isHomogeneousAggregate(Ty, Base, Members); ++ DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); ++ ++ bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); ++ return IsHA || IsIntArray; + } +Index: test/CodeGen/ARM/aggregate-padding.ll +=================================================================== +--- test/CodeGen/ARM/aggregate-padding.ll ++++ test/CodeGen/ARM/aggregate-padding.ll +@@ -0,0 +1,101 @@ ++; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s ++ ++; [2 x i64] should be contiguous when split (e.g. we shouldn't try to align all ++; i32 components to 64 bits). Also makes sure i64 based types are properly ++; aligned on the stack. ++define i64 @test_i64_contiguous_on_stack([8 x double], float, i32 %in, [2 x i64] %arg) nounwind { ++; CHECK-LABEL: test_i64_contiguous_on_stack: ++; CHECK-DAG: ldr [[LO0:r[0-9]+]], [sp, #8] ++; CHECK-DAG: ldr [[HI0:r[0-9]+]], [sp, #12] ++; CHECK-DAG: ldr [[LO1:r[0-9]+]], [sp, #16] ++; CHECK-DAG: ldr [[HI1:r[0-9]+]], [sp, #20] ++; CHECK: adds r0, [[LO0]], [[LO1]] ++; CHECK: adc r1, [[HI0]], [[HI1]] ++ ++ %val1 = extractvalue [2 x i64] %arg, 0 ++ %val2 = extractvalue [2 x i64] %arg, 1 ++ %sum = add i64 %val1, %val2 ++ ret i64 %sum ++} ++ ++; [2 x i64] should try to use 4 regs, not 8 (which might happen if the ++; i64 -> i32, i32 split wasn't handled correctly). 
++define i64 @test_2xi64_uses_4_regs([8 x double], float, [2 x i64] %arg) nounwind { ++; CHECK-LABEL: test_2xi64_uses_4_regs: ++; CHECK-DAG: mov r0, r2 ++; CHECK-DAG: mov r1, r3 ++ ++ %val = extractvalue [2 x i64] %arg, 1 ++ ret i64 %val ++} ++ ++; An aggregate should be able to split between registers and stack if there is ++; nothing else on the stack. ++define i32 @test_aggregates_split([8 x double], i32, [4 x i32] %arg) nounwind { ++; CHECK-LABEL: test_aggregates_split: ++; CHECK: ldr [[VAL3:r[0-9]+]], [sp] ++; CHECK: add r0, r1, [[VAL3]] ++ ++ %val0 = extractvalue [4 x i32] %arg, 0 ++ %val3 = extractvalue [4 x i32] %arg, 3 ++ %sum = add i32 %val0, %val3 ++ ret i32 %sum ++} ++ ++; If an aggregate has to be moved entirely onto the stack, nothing should be ++; able to use r0-r3 any more. Also checks that [2 x i64] is properly aligned when ++; it uses regs. ++define i32 @test_no_int_backfilling([8 x double], float, i32, [2 x i64], i32 %arg) nounwind { ++; CHECK-LABEL: test_no_int_backfilling: ++; CHECK: ldr r0, [sp, #24] ++ ret i32 %arg ++} ++ ++; Even if the argument was successfully allocated as reg block, there should be ++; no backfilling to r1. ++define i32 @test_no_int_backfilling_regsonly(i32, [1 x i64], i32 %arg) { ++; CHECK-LABEL: test_no_int_backfilling_regsonly: ++; CHECK: ldr r0, [sp] ++ ret i32 %arg ++} ++ ++; If an aggregate has to be moved entirely onto the stack, nothing should be ++; able to use r0-r3 any more. ++define float @test_no_float_backfilling([7 x double], [4 x i32], i32, [4 x double], float %arg) nounwind { ++; CHECK-LABEL: test_no_float_backfilling: ++; CHECK: vldr s0, [sp, #40] ++ ret float %arg ++} ++ ++; They're a bit pointless, but types like [N x i8] should work as well. 
++define i8 @test_i8_in_regs(i32, [3 x i8] %arg) { ++; CHECK-LABEL: test_i8_in_regs: ++; CHECK: add r0, r1, r3 ++ %val0 = extractvalue [3 x i8] %arg, 0 ++ %val2 = extractvalue [3 x i8] %arg, 2 ++ %sum = add i8 %val0, %val2 ++ ret i8 %sum ++} ++ ++define i16 @test_i16_split(i32, i32, [3 x i16] %arg) { ++; CHECK-LABEL: test_i16_split: ++; CHECK: ldrh [[VAL2:r[0-9]+]], [sp] ++; CHECK: add r0, r2, [[VAL2]] ++ %val0 = extractvalue [3 x i16] %arg, 0 ++ %val2 = extractvalue [3 x i16] %arg, 2 ++ %sum = add i16 %val0, %val2 ++ ret i16 %sum ++} ++ ++; Beware: on the stack each i16 still gets a 32-bit slot, the array is not ++; packed. ++define i16 @test_i16_forced_stack([8 x double], double, i32, i32, [3 x i16] %arg) { ++; CHECK-LABEL: test_i16_forced_stack: ++; CHECK-DAG: ldrh [[VAL0:r[0-9]+]], [sp, #8] ++; CHECK-DAG: ldrh [[VAL2:r[0-9]+]], [sp, #16] ++; CHECK: add r0, [[VAL0]], [[VAL2]] ++ %val0 = extractvalue [3 x i16] %arg, 0 ++ %val2 = extractvalue [3 x i16] %arg, 2 ++ %sum = add i16 %val0, %val2 ++ ret i16 %sum ++}