//===---HexagonLoadStoreWidening.cpp---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// HexagonStoreWidening:
// Replace sequences of "narrow" stores to adjacent memory locations with
// a fewer "wide" stores that have the same effect.
// For example, replace:
//   S4_storeirb_io  %100, 0, 0   ; store-immediate-byte
//   S4_storeirb_io  %100, 1, 0   ; store-immediate-byte
// with
//   S4_storeirh_io  %100, 0, 0   ; store-immediate-halfword
// The above is the general idea.  The actual cases handled by the code
// may be a bit more complex.
// The purpose of this pass is to reduce the number of outstanding stores,
// or as one could say, "reduce store queue pressure".  Also, wide stores
// mean fewer stores, and since there are only two memory instructions allowed
// per packet, it also means fewer packets, and ultimately fewer cycles.
//
// HexagonLoadWidening does the same thing as HexagonStoreWidening but
// for Loads. Here, we try to replace 4-byte Loads with register-pair loads.
// For example:
// Replace
//   %2:intregs = L2_loadri_io %1:intregs, 0 :: (load (s32) from %ptr1, align 8)
//   %3:intregs = L2_loadri_io %1:intregs, 4 :: (load (s32) from %ptr2)
// with
//   %4:doubleregs = L2_loadrd_io %1:intregs, 0 :: (load (s64) from %ptr1)
//   %2:intregs = COPY %4.isub_lo:doubleregs
//   %3:intregs = COPY %4.isub_hi:doubleregs
//
// LoadWidening for 8 and 16-bit loads is not useful as we end up generating 2N
// insts to replace N loads: 1 widened load, N bitwise and, N - 1 shifts

//===---------------------------------------------------------------------===//

#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "hexagon-load-store-widening"

static cl::opt<unsigned> MaxMBBSizeForLoadStoreWidening(
    "max-bb-size-for-load-store-widening", cl::Hidden, cl::init(1000),
    cl::desc("Limit block size to analyze in load/store widening pass"));

namespace {

struct HexagonLoadStoreWidening {
  enum WideningMode { Store, Load };
  const HexagonInstrInfo *TII;
  const HexagonRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  AliasAnalysis *AA;
  MachineFunction *MF;

public:
  HexagonLoadStoreWidening(const HexagonInstrInfo *TII,
                           const HexagonRegisterInfo *TRI,
                           MachineRegisterInfo *MRI, AliasAnalysis *AA,
                           MachineFunction *MF, bool StoreMode)
      : TII(TII), TRI(TRI), MRI(MRI), AA(AA), MF(MF),
        Mode(StoreMode ? WideningMode::Store : WideningMode::Load),
        HII(MF->getSubtarget<HexagonSubtarget>().getInstrInfo()) {}

  bool run();

private:
  const bool Mode;
  const unsigned MaxWideSize = 8;
  const HexagonInstrInfo *HII = nullptr;

  using InstrSet = SmallPtrSet<MachineInstr *, 16>;
  using InstrGroup = SmallVector<MachineInstr *, 8>;
  using InstrGroupList = SmallVector<InstrGroup, 8>;

  InstrSet ProcessedInsts;

  unsigned getBaseAddressRegister(const MachineInstr *MI);
  int64_t getOffset(const MachineInstr *MI);
  int64_t getPostIncrementValue(const MachineInstr *MI);
  bool handledInstType(const MachineInstr *MI);

  void createGroup(MachineInstr *BaseInst, InstrGroup &Group);
  void createGroups(MachineBasicBlock &MBB, InstrGroupList &StoreGroups);
  bool processBasicBlock(MachineBasicBlock &MBB);
  bool processGroup(InstrGroup &Group);
  bool selectInsts(InstrGroup::iterator Begin, InstrGroup::iterator End,
                   InstrGroup &OG, unsigned &TotalSize, unsigned MaxSize);
  bool createWideInsts(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize);
  bool createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize);
  bool createWideLoads(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize);
  bool replaceInsts(InstrGroup &OG, InstrGroup &NG);
  bool areAdjacent(const MachineInstr *S1, const MachineInstr *S2);
  bool canSwapInstructions(const MachineInstr *A, const MachineInstr *B);
};

struct HexagonStoreWidening : public MachineFunctionPass {
  static char ID;

  HexagonStoreWidening() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "Hexagon Store Widening"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MFn) override {
    if (skipFunction(MFn.getFunction()))
      return false;

    auto &ST = MFn.getSubtarget<HexagonSubtarget>();
    const HexagonInstrInfo *TII = ST.getInstrInfo();
    const HexagonRegisterInfo *TRI = ST.getRegisterInfo();
    MachineRegisterInfo *MRI = &MFn.getRegInfo();
    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

    return HexagonLoadStoreWidening(TII, TRI, MRI, AA, &MFn, true).run();
  }
};

struct HexagonLoadWidening : public MachineFunctionPass {
  static char ID;

  HexagonLoadWidening() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "Hexagon Load Widening"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MFn) override {
    if (skipFunction(MFn.getFunction()))
      return false;

    auto &ST = MFn.getSubtarget<HexagonSubtarget>();
    const HexagonInstrInfo *TII = ST.getInstrInfo();
    const HexagonRegisterInfo *TRI = ST.getRegisterInfo();
    MachineRegisterInfo *MRI = &MFn.getRegInfo();
    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    return HexagonLoadStoreWidening(TII, TRI, MRI, AA, &MFn, false).run();
  }
};

char HexagonStoreWidening::ID = 0;
char HexagonLoadWidening::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(HexagonStoreWidening, "hexagon-widen-stores",
                      "Hexagon Store Widening", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(HexagonStoreWidening, "hexagon-widen-stores",
                    "Hexagon Store Widening", false, false)

INITIALIZE_PASS_BEGIN(HexagonLoadWidening, "hexagon-widen-loads",
                      "Hexagon Load Widening", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(HexagonLoadWidening, "hexagon-widen-loads",
                    "Hexagon Load Widening", false, false)

static const MachineMemOperand &getMemTarget(const MachineInstr *MI) {
  assert(!MI->memoperands_empty() && "Expecting memory operands");
  return **MI->memoperands_begin();
}

unsigned
HexagonLoadStoreWidening::getBaseAddressRegister(const MachineInstr *MI) {
  assert(HexagonLoadStoreWidening::handledInstType(MI) && "Unhandled opcode");
  unsigned Base, Offset;
  HII->getBaseAndOffsetPosition(*MI, Base, Offset);
  const MachineOperand &MO = MI->getOperand(Base);
  assert(MO.isReg() && "Expecting register operand");
  return MO.getReg();
}

int64_t HexagonLoadStoreWidening::getOffset(const MachineInstr *MI) {
  assert(HexagonLoadStoreWidening::handledInstType(MI) && "Unhandled opcode");

  // On Hexagon, post-incs always have an offset of 0
  // There is no Offset operand to post-incs
  if (HII->isPostIncrement(*MI))
    return 0;

  unsigned Base, Offset;

  HII->getBaseAndOffsetPosition(*MI, Base, Offset);
  const MachineOperand &MO = MI->getOperand(Offset);
  switch (MO.getType()) {
  case MachineOperand::MO_Immediate:
    return MO.getImm();
  case MachineOperand::MO_GlobalAddress:
    return MO.getOffset();
  default:
    break;
  }
  llvm_unreachable("Expecting an immediate or global operand");
}

inline int64_t
HexagonLoadStoreWidening::getPostIncrementValue(const MachineInstr *MI) {
  unsigned Base, PostIncIdx;
  HII->getBaseAndOffsetPosition(*MI, Base, PostIncIdx);
  const MachineOperand &MO = MI->getOperand(PostIncIdx);
  return MO.getImm();
}

// Filtering function: any loads/stores whose opcodes are not "approved" of by
// this function will not be subjected to widening.
inline bool HexagonLoadStoreWidening::handledInstType(const MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();
  if (Mode == WideningMode::Store) {
    switch (Opc) {
    case Hexagon::S4_storeirb_io:
    case Hexagon::S4_storeirh_io:
    case Hexagon::S4_storeiri_io:
    case Hexagon::S2_storeri_io:
      // Base address must be a register. (Implement FI later.)
      return MI->getOperand(0).isReg();
    case Hexagon::S2_storeri_pi:
      return MI->getOperand(1).isReg();
    }
  } else {
    // LoadWidening for 8 and 16 bit loads needs 2x instructions to replace x
    // loads. So we only widen 32 bit loads as we don't need to select the
    // right bits with AND & SHIFT ops.
    switch (Opc) {
    case Hexagon::L2_loadri_io:
      // Base address must be a register and offset must be immediate.
      return !MI->memoperands_empty() && MI->getOperand(1).isReg() &&
             MI->getOperand(2).isImm();
    case Hexagon::L2_loadri_pi:
      return !MI->memoperands_empty() && MI->getOperand(2).isReg();
    }
  }
  return false;
}

static void addDefsUsesToList(const MachineInstr *MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool HexagonLoadStoreWidening::canSwapInstructions(const MachineInstr *A,
                                                   const MachineInstr *B) {
  DenseSet<Register> ARegDefs;
  DenseSet<Register> ARegUses;
  addDefsUsesToList(A, ARegDefs, ARegUses);
  if (A->mayLoadOrStore() && B->mayLoadOrStore() &&
      (A->mayStore() || B->mayStore()) && A->mayAlias(AA, *B, true))
    return false;
  for (const auto &BOp : B->operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Inspect a machine basic block, and generate groups out of loads/stores
// encountered in the block.
//
// A load/store group is a group of loads or stores that use the same base
// register, and which can be reordered within that group without altering the
// semantics of the program.  A single group could be widened as
// a whole, if there existed a single load/store instruction with the same
// semantics as the entire group.  In many cases, a single group may need more
// than one wide load or store.
void HexagonLoadStoreWidening::createGroups(MachineBasicBlock &MBB,
                                            InstrGroupList &StoreGroups) {
  // Traverse all instructions and if we encounter
  // a load/store, then try to create a group starting at that instruction
  // i.e. a sequence of independent loads/stores that can be widened.
  for (auto I = MBB.begin(); I != MBB.end(); ++I) {
    MachineInstr *MI = &(*I);
    if (!handledInstType(MI))
      continue;
    if (ProcessedInsts.count(MI))
      continue;

    // Found a store.  Try to create a store group.
    InstrGroup G;
    createGroup(MI, G);
    if (G.size() > 1)
      StoreGroups.push_back(G);
  }
}

// Create a single load/store group.  The insts need to be independent between
// themselves, and also there cannot be other instructions between them
// that could read or modify storage being read from or stored into.
void HexagonLoadStoreWidening::createGroup(MachineInstr *BaseInst,
                                           InstrGroup &Group) {
  assert(handledInstType(BaseInst) && "Unexpected instruction");
  unsigned BaseReg = getBaseAddressRegister(BaseInst);
  InstrGroup Other;

  Group.push_back(BaseInst);
  LLVM_DEBUG(dbgs() << "BaseInst: "; BaseInst->dump());
  auto End = BaseInst->getParent()->end();
  auto I = BaseInst->getIterator();

  while (true) {
    I = std::next(I);
    if (I == End)
      break;
    MachineInstr *MI = &(*I);

    // Assume calls are aliased to everything.
    if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
        MI->hasOrderedMemoryRef())
      return;

    if (!handledInstType(MI)) {
      if (MI->mayLoadOrStore())
        Other.push_back(MI);
      continue;
    }

    // We have a handledInstType instruction
    // If this load/store instruction is aliased with anything already in the
    // group, terminate the group now.
    for (auto GI : Group)
      if (GI->mayAlias(AA, *MI, true))
        return;
    if (Mode == WideningMode::Load) {
      // Check if current load MI can be moved to the first load instruction
      // in Group. If any load instruction aliases with memory instructions in
      // Other, terminate the group.
      for (auto MemI : Other)
        if (!canSwapInstructions(MI, MemI))
          return;
    } else {
      // Check if store instructions in the group can be moved to current
      // store MI. If any store instruction aliases with memory instructions
      // in Other, terminate the group.
      for (auto MemI : Other) {
        if (std::distance(Group.back()->getIterator(), MemI->getIterator()) <=
            0)
          continue;
        for (auto GI : Group)
          if (!canSwapInstructions(MemI, GI))
            return;
      }
    }

    unsigned BR = getBaseAddressRegister(MI);
    if (BR == BaseReg) {
      LLVM_DEBUG(dbgs() << "Added MI to group: "; MI->dump());
      Group.push_back(MI);
      ProcessedInsts.insert(MI);
    }
  } // while
}

// Check if load/store instructions S1 and S2 are adjacent.  More precisely,
// S2 has to access memory immediately following that accessed by S1.
bool HexagonLoadStoreWidening::areAdjacent(const MachineInstr *S1,
                                           const MachineInstr *S2) {
  if (!handledInstType(S1) || !handledInstType(S2))
    return false;

  const MachineMemOperand &S1MO = getMemTarget(S1);

  // Currently only handling immediate stores.
  int Off1 = getOffset(S1);
  int Off2 = getOffset(S2);

  return (Off1 >= 0) ? Off1 + S1MO.getSize().getValue() == unsigned(Off2)
                     : int(Off1 + S1MO.getSize().getValue()) == Off2;
}

/// Given a sequence of adjacent loads/stores, and a maximum size of a single
/// wide inst, pick a group of insts that can be replaced by a single load/store
/// of size not exceeding MaxSize.  The selected sequence will be recorded
/// in OG ("old group" of instructions).
/// OG should be empty on entry, and should be left empty if the function
/// fails.
bool HexagonLoadStoreWidening::selectInsts(InstrGroup::iterator Begin,
                                           InstrGroup::iterator End,
                                           InstrGroup &OG, unsigned &TotalSize,
                                           unsigned MaxSize) {
  assert(Begin != End && "No instructions to analyze");
  assert(OG.empty() && "Old group not empty on entry");

  if (std::distance(Begin, End) <= 1)
    return false;

  MachineInstr *FirstMI = *Begin;
  assert(!FirstMI->memoperands_empty() && "Expecting some memory operands");
  const MachineMemOperand &FirstMMO = getMemTarget(FirstMI);
  if (!FirstMMO.getType().isValid())
    return false;

  unsigned Alignment = FirstMMO.getAlign().value();
  unsigned SizeAccum = FirstMMO.getSize().getValue();
  unsigned FirstOffset = getOffset(FirstMI);

  // The initial value of SizeAccum should always be a power of 2.
  assert(isPowerOf2_32(SizeAccum) && "First store size not a power of 2");

  // If the size of the first store equals to or exceeds the limit, do nothing.
  if (SizeAccum >= MaxSize)
    return false;

  // If the size of the first load/store is greater than or equal to the address
  // stored to, then the inst cannot be made any wider.
  if (SizeAccum >= Alignment) {
    LLVM_DEBUG(
        dbgs() << "Size of load/store greater than equal to its alignment\n");
    return false;
  }

  // The offset of a load/store will put restrictions on how wide the inst can
  // be.  Offsets in loads/stores of size 2^n bytes need to have the n lowest
  // bits be 0.  If the first inst already exhausts the offset limits, quit.
  // Test this by checking if the next wider size would exceed the limit.
  // For post-increment instructions, the increment amount needs to follow the
  // same rule.
  unsigned OffsetOrIncVal = 0;
  if (HII->isPostIncrement(*FirstMI))
    OffsetOrIncVal = getPostIncrementValue(FirstMI);
  else
    OffsetOrIncVal = FirstOffset;
  if ((2 * SizeAccum - 1) & OffsetOrIncVal) {
    LLVM_DEBUG(dbgs() << "Instruction cannot be widened as the offset/postinc"
                      << " value: " << getPostIncrementValue(FirstMI)
                      << " is invalid in the widened version\n");
    return false;
  }

  OG.push_back(FirstMI);
  MachineInstr *S1 = FirstMI;

  // Pow2Num will be the largest number of elements in OG such that the sum
  // of sizes of loads/stores 0...Pow2Num-1 will be a power of 2.
  unsigned Pow2Num = 1;
  unsigned Pow2Size = SizeAccum;
  bool HavePostInc = HII->isPostIncrement(*S1);

  // Be greedy: keep accumulating insts as long as they are to adjacent
  // memory locations, and as long as the total number of bytes stored
  // does not exceed the limit (MaxSize).
  // Keep track of when the total size covered is a power of 2, since
  // this is a size a single load/store can cover.
  for (InstrGroup::iterator I = Begin + 1; I != End; ++I) {
    MachineInstr *S2 = *I;
    // Insts are sorted, so if S1 and S2 are not adjacent, there won't be
    // any other store to fill the "hole".
    if (!areAdjacent(S1, S2))
      break;

    // Cannot widen two post increments, need to return two registers
    // with incremented values
    if (HavePostInc && HII->isPostIncrement(*S2))
      break;

    unsigned S2Size = getMemTarget(S2).getSize().getValue();
    if (SizeAccum + S2Size > std::min(MaxSize, Alignment))
      break;

    OG.push_back(S2);
    SizeAccum += S2Size;
    if (isPowerOf2_32(SizeAccum)) {
      Pow2Num = OG.size();
      Pow2Size = SizeAccum;
    }
    if ((2 * Pow2Size - 1) & FirstOffset)
      break;

    S1 = S2;
  }

  // The insts don't add up to anything that can be widened.  Clean up.
  if (Pow2Num <= 1) {
    OG.clear();
    return false;
  }

  // Only leave the loads/stores being widened.
  OG.resize(Pow2Num);
  TotalSize = Pow2Size;
  return true;
}

/// Given an "old group" OG of insts, create a "new group" NG of instructions
/// to replace them.
bool HexagonLoadStoreWidening::createWideInsts(InstrGroup &OG, InstrGroup &NG,
                                               unsigned TotalSize) {
  if (Mode == WideningMode::Store) {
    return createWideStores(OG, NG, TotalSize);
  }
  return createWideLoads(OG, NG, TotalSize);
}

/// Given an "old group" OG of stores, create a "new group" NG of instructions
/// to replace them.  Ideally, NG would only have a single instruction in it,
/// but that may only be possible for store-immediate.
bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
                                                unsigned TotalSize) {
  // XXX Current limitations:
  // - only handle a TotalSize of up to 8

  LLVM_DEBUG(dbgs() << "Creating wide stores\n");
  if (TotalSize > MaxWideSize)
    return false;

  uint64_t Acc = 0; // Value accumulator.
  unsigned Shift = 0;
  bool HaveImm = false;
  bool HaveReg = false;

  for (MachineInstr *MI : OG) {
    const MachineMemOperand &MMO = getMemTarget(MI);
    MachineOperand &SO = HII->isPostIncrement(*MI)
                             ? MI->getOperand(3)
                             : MI->getOperand(2); // Source.
    unsigned NBits;
    uint64_t Mask;
    uint64_t Val;

    switch (SO.getType()) {
    case MachineOperand::MO_Immediate:
      LLVM_DEBUG(dbgs() << "Have store immediate\n");
      HaveImm = true;

      NBits = MMO.getSizeInBits().toRaw();
      Mask = (0xFFFFFFFFFFFFFFFFU >> (64 - NBits));
      Val = (SO.getImm() & Mask) << Shift;
      Acc |= Val;
      Shift += NBits;
      break;
    case MachineOperand::MO_Register:
      HaveReg = true;
      break;
    default:
      LLVM_DEBUG(dbgs() << "Unhandled store\n");
      return false;
    }
  }

  if (HaveImm && HaveReg) {
    LLVM_DEBUG(dbgs() << "Cannot merge store register and store imm\n");
    return false;
  }

  MachineInstr *FirstSt = OG.front();
  DebugLoc DL = OG.back()->getDebugLoc();
  const MachineMemOperand &OldM = getMemTarget(FirstSt);
  MachineMemOperand *NewM =
      MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(),
                               TotalSize, OldM.getAlign(), OldM.getAAInfo());
  MachineInstr *StI;
  MachineOperand &MR =
      (HII->isPostIncrement(*FirstSt) ? FirstSt->getOperand(1)
                                      : FirstSt->getOperand(0));
  auto SecondSt = OG.back();
  if (HaveReg) {
    MachineOperand FReg =
        (HII->isPostIncrement(*FirstSt) ? FirstSt->getOperand(3)
                                        : FirstSt->getOperand(2));
    // Post increments appear first in the sorted group.
    // Cannot have a post increment for the second instruction
    assert(!HII->isPostIncrement(*SecondSt) && "Unexpected PostInc");
    MachineOperand SReg = SecondSt->getOperand(2);
    assert(FReg.isReg() && SReg.isReg() &&
           "Cannot merge store register and store imm");
    const MCInstrDesc &CombD = TII->get(Hexagon::A2_combinew);
    Register VReg =
        MF->getRegInfo().createVirtualRegister(&Hexagon::DoubleRegsRegClass);
    MachineInstr *CombI = BuildMI(*MF, DL, CombD, VReg).add(SReg).add(FReg);
    NG.push_back(CombI);

    if (FirstSt->getOpcode() == Hexagon::S2_storeri_pi) {
      const MCInstrDesc &StD = TII->get(Hexagon::S2_storerd_pi);
      auto IncDestMO = FirstSt->getOperand(0);
      auto IncMO = FirstSt->getOperand(2);
      StI =
          BuildMI(*MF, DL, StD).add(IncDestMO).add(MR).add(IncMO).addReg(VReg);
    } else {
      const MCInstrDesc &StD = TII->get(Hexagon::S2_storerd_io);
      auto OffMO = FirstSt->getOperand(1);
      StI = BuildMI(*MF, DL, StD).add(MR).add(OffMO).addReg(VReg);
    }
    StI->addMemOperand(*MF, NewM);
    NG.push_back(StI);
    return true;
  }

  // Handle store immediates
  // There are no post increment store immediates on Hexagon
  assert(!HII->isPostIncrement(*FirstSt) && "Unexpected PostInc");
  auto Off = FirstSt->getOperand(1).getImm();
  if (TotalSize == 8) {
    // Create vreg = A2_tfrsi #Acc; nreg = combine(#s32, vreg); memd = nreg
    uint64_t Mask = 0xFFFFFFFFU;
    int LowerAcc = int(Mask & Acc);
    int UpperAcc = Acc >> 32;
    Register DReg =
        MF->getRegInfo().createVirtualRegister(&Hexagon::DoubleRegsRegClass);
    MachineInstr *CombI;
    if (Acc != 0) {
      const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
      const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF);
      Register VReg = MF->getRegInfo().createVirtualRegister(RC);
      MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(LowerAcc);
      NG.push_back(TfrI);
      const MCInstrDesc &CombD = TII->get(Hexagon::A4_combineir);
      CombI = BuildMI(*MF, DL, CombD, DReg)
                  .addImm(UpperAcc)
                  .addReg(VReg, RegState::Kill);
    }
    // If immediates are 0, we do not need A2_tfrsi
    else {
      const MCInstrDesc &CombD = TII->get(Hexagon::A4_combineii);
      CombI = BuildMI(*MF, DL, CombD, DReg).addImm(0).addImm(0);
    }
    NG.push_back(CombI);
    const MCInstrDesc &StD = TII->get(Hexagon::S2_storerd_io);
    StI =
        BuildMI(*MF, DL, StD).add(MR).addImm(Off).addReg(DReg, RegState::Kill);
  } else if (Acc < 0x10000) {
    // Create mem[hw] = #Acc
    unsigned WOpc = (TotalSize == 2)   ? Hexagon::S4_storeirh_io
                    : (TotalSize == 4) ? Hexagon::S4_storeiri_io
                                       : 0;
    assert(WOpc && "Unexpected size");

    int Val = (TotalSize == 2) ? int16_t(Acc) : int(Acc);
    const MCInstrDesc &StD = TII->get(WOpc);
    StI = BuildMI(*MF, DL, StD).add(MR).addImm(Off).addImm(Val);
  } else {
    // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg
    const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
    const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF);
    Register VReg = MF->getRegInfo().createVirtualRegister(RC);
    MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(int(Acc));
    NG.push_back(TfrI);

    unsigned WOpc = (TotalSize == 2)   ? Hexagon::S2_storerh_io
                    : (TotalSize == 4) ? Hexagon::S2_storeri_io
                                       : 0;
    assert(WOpc && "Unexpected size");

    const MCInstrDesc &StD = TII->get(WOpc);
    StI =
        BuildMI(*MF, DL, StD).add(MR).addImm(Off).addReg(VReg, RegState::Kill);
  }
  StI->addMemOperand(*MF, NewM);
  NG.push_back(StI);

  return true;
}

/// Given an "old group" OG of loads, create a "new group" NG of instructions
/// to replace them.  Ideally, NG would only have a single instruction in it,
/// but that may only be possible for double register loads.
bool HexagonLoadStoreWidening::createWideLoads(InstrGroup &OG, InstrGroup &NG,
                                               unsigned TotalSize) {
  LLVM_DEBUG(dbgs() << "Creating wide loads\n");
  // XXX Current limitations:
  // - only expect stores of immediate values in OG,
  // - only handle a TotalSize of up to 8
  if (TotalSize > MaxWideSize)
    return false;
  assert(OG.size() == 2 && "Expecting two elements in Instruction Group.");

  MachineInstr *FirstLd = OG.front();
  const MachineMemOperand &OldM = getMemTarget(FirstLd);
  MachineMemOperand *NewM =
      MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(),
                               TotalSize, OldM.getAlign(), OldM.getAAInfo());

  MachineOperand &MR = FirstLd->getOperand(0);
  MachineOperand &MRBase =
      (HII->isPostIncrement(*FirstLd) ? FirstLd->getOperand(2)
                                      : FirstLd->getOperand(1));
  DebugLoc DL = OG.back()->getDebugLoc();

  // Create the double register Load Instruction.
  Register NewMR = MRI->createVirtualRegister(&Hexagon::DoubleRegsRegClass);
  MachineInstr *LdI;

  // Post increments appear first in the sorted group
  if (FirstLd->getOpcode() == Hexagon::L2_loadri_pi) {
    auto IncDestMO = FirstLd->getOperand(1);
    auto IncMO = FirstLd->getOperand(3);
    LdI = BuildMI(*MF, DL, TII->get(Hexagon::L2_loadrd_pi))
              .addDef(NewMR, getKillRegState(MR.isKill()), MR.getSubReg())
              .add(IncDestMO)
              .add(MRBase)
              .add(IncMO);
    LdI->addMemOperand(*MF, NewM);
  } else {
    auto OffMO = FirstLd->getOperand(2);
    LdI = BuildMI(*MF, DL, TII->get(Hexagon::L2_loadrd_io))
              .addDef(NewMR, getKillRegState(MR.isKill()), MR.getSubReg())
              .add(MRBase)
              .add(OffMO);
    LdI->addMemOperand(*MF, NewM);
  }
  NG.push_back(LdI);

  auto getHalfReg = [&](MachineInstr *DoubleReg, unsigned SubReg,
                        MachineInstr *DstReg) {
    Register DestReg = DstReg->getOperand(0).getReg();
    return BuildMI(*MF, DL, TII->get(Hexagon::COPY), DestReg)
        .addReg(NewMR, getKillRegState(LdI->isKill()), SubReg);
  };

  MachineInstr *LdI_lo = getHalfReg(LdI, Hexagon::isub_lo, FirstLd);
  MachineInstr *LdI_hi = getHalfReg(LdI, Hexagon::isub_hi, OG.back());
  NG.push_back(LdI_lo);
  NG.push_back(LdI_hi);

  return true;
}

// Replace instructions from the old group OG with instructions from the
// new group NG.  Conceptually, remove all instructions in OG, and then
// insert all instructions in NG, starting at where the first instruction
// from OG was (in the order in which they appeared in the basic block).
// (The ordering in OG does not have to match the order in the basic block.)
bool HexagonLoadStoreWidening::replaceInsts(InstrGroup &OG, InstrGroup &NG) {
  LLVM_DEBUG({
    dbgs() << "Replacing:\n";
    for (auto I : OG)
      dbgs() << "  " << *I;
    dbgs() << "with\n";
    for (auto I : NG)
      dbgs() << "  " << *I;
  });

  MachineBasicBlock *MBB = OG.back()->getParent();
  MachineBasicBlock::iterator InsertAt = MBB->end();

  // Need to establish the insertion point.
  // For loads the best one is right before the first load in the OG,
  // but in the order in which the insts occur in the program list.
  // For stores the best point is right after the last store in the OG.
  // Since the ordering in OG does not correspond
  // to the order in the program list, we need to do some work to find
  // the insertion point.

  // Create a set of all instructions in OG (for quick lookup).
  InstrSet OldMemInsts(llvm::from_range, OG);

  if (Mode == WideningMode::Load) {
    // Find the first load instruction in the block that is present in OG.
    for (auto &I : *MBB) {
      if (OldMemInsts.count(&I)) {
        InsertAt = I;
        break;
      }
    }

    assert((InsertAt != MBB->end()) && "Cannot locate any load from the group");

    for (auto *I : NG)
      MBB->insert(InsertAt, I);
  } else {
    // Find the last store instruction in the block that is present in OG.
    auto I = MBB->rbegin();
    for (; I != MBB->rend(); ++I) {
      if (OldMemInsts.count(&(*I))) {
        InsertAt = (*I).getIterator();
        break;
      }
    }

    assert((I != MBB->rend()) && "Cannot locate any store from the group");

    for (auto I = NG.rbegin(); I != NG.rend(); ++I)
      MBB->insertAfter(InsertAt, *I);
  }

  for (auto *I : OG)
    I->eraseFromParent();

  return true;
}

// Break up the group into smaller groups, each of which can be replaced by
// a single wide load/store.  Widen each such smaller group and replace the old
// instructions with the widened ones.
bool HexagonLoadStoreWidening::processGroup(InstrGroup &Group) {
  bool Changed = false;
  InstrGroup::iterator I = Group.begin(), E = Group.end();
  InstrGroup OG, NG; // Old and new groups.
  unsigned CollectedSize;

  while (I != E) {
    OG.clear();
    NG.clear();

    bool Succ = selectInsts(I++, E, OG, CollectedSize, MaxWideSize) &&
                createWideInsts(OG, NG, CollectedSize) && replaceInsts(OG, NG);
    if (!Succ)
      continue;

    assert(OG.size() > 1 && "Created invalid group");
    assert(std::distance(I, E) + 1 >= int(OG.size()) && "Too many elements");
    I += OG.size() - 1;

    Changed = true;
  }

  return Changed;
}

// Process a single basic block: create the load/store groups, and replace them
// with the widened insts, if possible.  Processing of each basic block
// is independent from processing of any other basic block.  This transfor-
// mation could be stopped after having processed any basic block without
// any ill effects (other than not having performed widening in the unpro-
// cessed blocks).  Also, the basic blocks can be processed in any order.
bool HexagonLoadStoreWidening::processBasicBlock(MachineBasicBlock &MBB) {
  InstrGroupList SGs;
  bool Changed = false;

  // To prevent long compile time check for max BB size.
  if (MBB.size() > MaxMBBSizeForLoadStoreWidening)
    return false;

  createGroups(MBB, SGs);

  auto Less = [this](const MachineInstr *A, const MachineInstr *B) -> bool {
    return getOffset(A) < getOffset(B);
  };
  for (auto &G : SGs) {
    assert(G.size() > 1 && "Group with fewer than 2 elements");
    llvm::sort(G, Less);

    Changed |= processGroup(G);
  }

  return Changed;
}

bool HexagonLoadStoreWidening::run() {
  bool Changed = false;

  for (auto &B : *MF)
    Changed |= processBasicBlock(B);

  return Changed;
}

FunctionPass *llvm::createHexagonStoreWidening() {
  return new HexagonStoreWidening();
}

FunctionPass *llvm::createHexagonLoadWidening() {
  return new HexagonLoadWidening();
}
