//===- KernelInfo.cpp - Kernel Analysis -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the KernelInfoPrinter class used to emit remarks about
// function properties from a GPU kernel.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/KernelInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

#define DEBUG_TYPE "kernel-info"

namespace {

/// Data structure holding function info for kernels.
/// Data structure holding function info for kernels.
///
/// An instance is filled in by emitKernelInfo(): the function-level
/// properties are recorded directly, and the instruction counters are
/// accumulated one basic block at a time by updateForBB().
class KernelInfo {
  /// Update the counters below for every non-debug instruction in \p BB and
  /// emit a remark via \p ORE for each alloca, call, and flat address space
  /// memory access that is encountered.
  void updateForBB(const BasicBlock &BB, OptimizationRemarkEmitter &ORE);

public:
  /// Collect the properties of \p F and emit each of them as a remark.
  static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
                             TargetMachine *TM);

  /// Whether the function has external linkage and is not a kernel function.
  bool ExternalNotKernel = false;

  /// Launch bounds, as (property name, value) pairs.
  SmallVector<std::pair<StringRef, int64_t>> LaunchBounds;

  /// The number of alloca instructions inside the function, the number of those
  /// with allocation sizes that cannot be determined at compile time, and the
  /// sum of the sizes that can be.
  ///
  /// With the current implementation for at least some GPU archs,
  /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in
  /// case the implementation changes.
  int64_t Allocas = 0;
  int64_t AllocasDyn = 0;
  int64_t AllocasStaticSizeSum = 0;

  /// Number of direct/indirect calls (anything derived from CallBase).
  int64_t DirectCalls = 0;
  int64_t IndirectCalls = 0;

  /// Number of direct calls made from this function to other functions
  /// defined in this module.
  int64_t DirectCallsToDefinedFunctions = 0;

  /// Number of direct calls to inline assembly.
  int64_t InlineAssemblyCalls = 0;

  /// Number of calls of type InvokeInst.
  int64_t Invokes = 0;

  /// Target-specific flat address space.
  ///
  /// Initialized to ~0U, the value TargetTransformInfo::getFlatAddressSpace()
  /// returns for targets without a flat address space, so that an instance
  /// never carries an indeterminate value before emitKernelInfo() assigns the
  /// real one.
  unsigned FlatAddrspace = ~0U;

  /// Number of flat address space memory accesses (via load, store, etc.).
  int64_t FlatAddrspaceAccesses = 0;
};

} // end anonymous namespace

/// Append a quoted description of \p V to \p R.
///
/// When \p V is a function that carries a debug-info subprogram, the
/// subprogram's name is used, preceded by "artificial " if the subprogram is
/// marked artificial. Otherwise, or if that name is empty, \p V is printed as
/// an operand (without its type) in the context of \p M. A non-empty \p Kind
/// is inserted, followed by a space, right before the quoted name.
static void identifyCallee(OptimizationRemark &R, const Module *M,
                           const Value *V, StringRef Kind = "") {
  // Holds either a source-level function name or a printed IR operand, which
  // for inline assembly is the asm expression.
  SmallString<100> Label;

  if (const auto *Fn = dyn_cast<Function>(V))
    if (const auto *SP = Fn->getSubprogram()) {
      if (SP->isArtificial())
        R << "artificial ";
      Label = SP->getName();
    }

  // No usable debug-info name: fall back to the IR spelling of the value.
  if (Label.empty()) {
    raw_svector_ostream Stream(Label);
    V->printAsOperand(Stream, /*PrintType=*/false, M);
  }

  if (!Kind.empty())
    R << Kind << " ";
  R << "'" << Label << "'";
}

/// Append "function '<name>'" for \p F to \p R, resolving the name the same
/// way identifyCallee() does and using the module that contains \p F.
static void identifyFunction(OptimizationRemark &R, const Function &F) {
  const Module *Parent = F.getParent();
  identifyCallee(R, Parent, &F, /*Kind=*/"function");
}

/// Emit an "Alloca" remark describing \p Alloca, which appears in \p Caller.
///
/// The remark names the enclosing function, the alloca's IR name, the source
/// variable it was declared for (when a debug declare record is attached), and
/// either its static size in bytes or the fact that its size is dynamic.
///
/// \param StaticSize the allocation size in bytes when it is known at compile
///        time. Callers pass 0 when the size is not known, so a value of 0 is
///        disambiguated here by querying \p Alloca directly.
static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller,
                         const AllocaInst &Alloca,
                         TypeSize::ScalarTy StaticSize) {
  // The remark is built lazily so that nothing below runs unless the remark is
  // actually going to be emitted.
  ORE.emit([&] {
    StringRef DbgName;
    DebugLoc Loc;
    bool Artificial = false;
    // findDVRDeclares() takes a non-const pointer, hence the const_cast.
    auto DVRs = findDVRDeclares(&const_cast<AllocaInst &>(Alloca));
    if (!DVRs.empty()) {
      // Only the first declare record is used to describe the variable.
      const DbgVariableRecord &DVR = **DVRs.begin();
      const auto *Var = DVR.getVariable();
      DbgName = Var->getName();
      Loc = DVR.getDebugLoc();
      Artificial = Var->isArtificial();
    }

    // StaticSize == 0 is ambiguous: it is passed both for a dynamically sized
    // alloca and for a statically sized alloca of zero bytes (e.g. an empty
    // aggregate). Ask the alloca itself so that the latter is not misreported
    // as having a dynamic size.
    bool HasStaticSize = StaticSize != 0;
    if (!HasStaticSize) {
      const DataLayout &DL = Caller.getParent()->getDataLayout();
      std::optional<TypeSize> Size = Alloca.getAllocationSize(DL);
      HasStaticSize = Size.has_value() && !Size->isScalable();
    }

    OptimizationRemark R(DEBUG_TYPE, "Alloca", DiagnosticLocation(Loc),
                         Alloca.getParent());
    R << "in ";
    identifyFunction(R, Caller);
    R << ", ";
    if (Artificial)
      R << "artificial ";
    SmallString<20> ValName;
    raw_svector_ostream OS(ValName);
    Alloca.printAsOperand(OS, /*PrintType=*/false, Caller.getParent());
    R << "alloca ('" << ValName << "') ";
    if (!DbgName.empty())
      R << "for '" << DbgName << "' ";
    else
      R << "without debug info ";
    R << "with ";
    if (HasStaticSize)
      // StaticSize is unsigned, so format it as such.
      R << "static size of " << utostr(StaticSize) << " bytes";
    else
      R << "dynamic size";
    return R;
  });
}

/// Emit a remark named \p RemarkKind for \p Call, which appears in \p Caller.
///
/// The message identifies the enclosing function, states \p CallKind (a
/// human-readable description of the kind of call), and identifies the callee
/// from the call's called operand.
static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller,
                       const CallBase &Call, StringRef CallKind,
                       StringRef RemarkKind) {
  // Built lazily: the emitter only invokes the builder when it needs the
  // remark.
  auto BuildRemark = [&] {
    const Module *M = Caller.getParent();
    OptimizationRemark Remark(DEBUG_TYPE, RemarkKind, &Call);
    Remark << "in ";
    identifyFunction(Remark, Caller);
    Remark << ", " << CallKind << ", callee is ";
    identifyCallee(Remark, M, Call.getCalledOperand());
    return Remark;
  };
  ORE.emit(BuildRemark);
}

/// Emit a "FlatAddrspaceAccess" remark for \p Inst, which appears in
/// \p Caller.
///
/// Intrinsic calls are described by the name of the called intrinsic, all
/// other instructions by their opcode name. If the instruction produces a
/// value, its IR name is included as well.
static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE,
                                      const Function &Caller,
                                      const Instruction &Inst) {
  ORE.emit([&] {
    OptimizationRemark Remark(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst);
    Remark << "in ";
    identifyFunction(Remark, Caller);

    // Describe what kind of instruction performs the access.
    if (const auto *Intrinsic = dyn_cast<IntrinsicInst>(&Inst))
      Remark << ", '" << Intrinsic->getCalledFunction()->getName() << "' call";
    else
      Remark << ", '" << Inst.getOpcodeName() << "' instruction";

    // Void instructions have no result name worth printing.
    const bool ProducesValue = !Inst.getType()->isVoidTy();
    if (ProducesValue) {
      SmallString<20> ResultName;
      raw_svector_ostream Stream(ResultName);
      Inst.printAsOperand(Stream, /*PrintType=*/false, Caller.getParent());
      Remark << " ('" << ResultName << "')";
    }

    Remark << " accesses memory in flat address space";
    return Remark;
  });
}

void KernelInfo::updateForBB(const BasicBlock &BB,
                             OptimizationRemarkEmitter &ORE) {
  const Function &F = *BB.getParent();
  const Module &M = *F.getParent();
  const DataLayout &DL = M.getDataLayout();
  for (const Instruction &I : BB.instructionsWithoutDebug()) {
    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) {
      ++Allocas;
      TypeSize::ScalarTy StaticSize = 0;
      if (std::optional<TypeSize> Size = Alloca->getAllocationSize(DL)) {
        StaticSize = Size->getFixedValue();
        assert(StaticSize <=
               (TypeSize::ScalarTy)std::numeric_limits<int64_t>::max());
        AllocasStaticSizeSum += StaticSize;
      } else {
        ++AllocasDyn;
      }
      remarkAlloca(ORE, F, *Alloca, StaticSize);
    } else if (const CallBase *Call = dyn_cast<CallBase>(&I)) {
      SmallString<40> CallKind;
      SmallString<40> RemarkKind;
      if (Call->isIndirectCall()) {
        ++IndirectCalls;
        CallKind += "indirect";
        RemarkKind += "Indirect";
      } else {
        ++DirectCalls;
        CallKind += "direct";
        RemarkKind += "Direct";
      }
      if (isa<InvokeInst>(Call)) {
        ++Invokes;
        CallKind += " invoke";
        RemarkKind += "Invoke";
      } else {
        CallKind += " call";
        RemarkKind += "Call";
      }
      if (!Call->isIndirectCall()) {
        if (const Function *Callee = Call->getCalledFunction()) {
          if (!Callee->isIntrinsic() && !Callee->isDeclaration()) {
            ++DirectCallsToDefinedFunctions;
            CallKind += " to defined function";
            RemarkKind += "ToDefinedFunction";
          }
        } else if (Call->isInlineAsm()) {
          ++InlineAssemblyCalls;
          CallKind += " to inline assembly";
          RemarkKind += "ToInlineAssembly";
        }
      }
      remarkCall(ORE, F, *Call, CallKind, RemarkKind);
      if (const AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Call)) {
        if (MI->getDestAddressSpace() == FlatAddrspace) {
          ++FlatAddrspaceAccesses;
          remarkFlatAddrspaceAccess(ORE, F, I);
        } else if (const AnyMemTransferInst *MT =
                       dyn_cast<AnyMemTransferInst>(MI)) {
          if (MT->getSourceAddressSpace() == FlatAddrspace) {
            ++FlatAddrspaceAccesses;
            remarkFlatAddrspaceAccess(ORE, F, I);
          }
        }
      }
    } else if (const LoadInst *Load = dyn_cast<LoadInst>(&I)) {
      if (Load->getPointerAddressSpace() == FlatAddrspace) {
        ++FlatAddrspaceAccesses;
        remarkFlatAddrspaceAccess(ORE, F, I);
      }
    } else if (const StoreInst *Store = dyn_cast<StoreInst>(&I)) {
      if (Store->getPointerAddressSpace() == FlatAddrspace) {
        ++FlatAddrspaceAccesses;
        remarkFlatAddrspaceAccess(ORE, F, I);
      }
    } else if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) {
      if (At->getPointerAddressSpace() == FlatAddrspace) {
        ++FlatAddrspaceAccesses;
        remarkFlatAddrspaceAccess(ORE, F, I);
      }
    } else if (const AtomicCmpXchgInst *At = dyn_cast<AtomicCmpXchgInst>(&I)) {
      if (At->getPointerAddressSpace() == FlatAddrspace) {
        ++FlatAddrspaceAccesses;
        remarkFlatAddrspaceAccess(ORE, F, I);
      }
    }
  }
}

/// Emit a remark named \p Name for \p F whose message has the form
/// "in function '<F>', <Name> = <Value>".
static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F,
                           StringRef Name, int64_t Value) {
  // Built lazily: the emitter only invokes the builder when it needs the
  // remark.
  auto BuildRemark = [&] {
    OptimizationRemark Remark(DEBUG_TYPE, Name, &F);
    Remark << "in ";
    identifyFunction(Remark, F);
    Remark << ", " << Name << " = " << itostr(Value);
    return Remark;
  };
  ORE.emit(BuildRemark);
}

/// Return the value of the function attribute \p Name on \p F parsed as an
/// integer, or std::nullopt if \p F does not have that attribute.
static std::optional<int64_t> parseFnAttrAsInteger(Function &F,
                                                   StringRef Name) {
  if (F.hasFnAttribute(Name))
    return F.getFnAttributeAsParsedInteger(Name);
  return std::nullopt;
}

/// Gather the properties of \p F and emit them as remarks.
///
/// Function-level properties (linkage/kernel status and launch bounds) are
/// recorded first, then every basic block is scanned, which emits the
/// per-instruction remarks, and finally one remark per summary property is
/// emitted.
void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM,
                                TargetMachine *TM) {
  KernelInfo Info;
  TargetTransformInfo &TargetInfo = FAM.getResult<TargetIRAnalysis>(F);
  Info.FlatAddrspace = TargetInfo.getFlatAddressSpace();

  // Function-level properties.
  if (F.hasExternalLinkage())
    Info.ExternalNotKernel = !F.hasKernelCallingConv();

  // Launch bounds given as function attributes come first, followed by
  // whatever the target reports.
  static constexpr const char *LaunchBoundAttrs[] = {"omp_target_num_teams",
                                                     "omp_target_thread_limit"};
  for (const char *Attr : LaunchBoundAttrs) {
    if (std::optional<int64_t> Bound = parseFnAttrAsInteger(F, Attr))
      Info.LaunchBounds.push_back({Attr, *Bound});
  }
  TargetInfo.collectKernelLaunchBounds(F, Info.LaunchBounds);

  // Instruction-level statistics; this also emits the per-instruction remarks.
  OptimizationRemarkEmitter &ORE =
      FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  for (const BasicBlock &Block : F)
    Info.updateForBB(Block, ORE);

  // Summary remarks. Each remark name is spelled exactly like the member it
  // reports.
  remarkProperty(ORE, F, "ExternalNotKernel", Info.ExternalNotKernel);
  for (const auto &[BoundName, BoundValue] : Info.LaunchBounds)
    remarkProperty(ORE, F, BoundName, BoundValue);

  const std::pair<StringRef, int64_t> Counters[] = {
      {"Allocas", Info.Allocas},
      {"AllocasStaticSizeSum", Info.AllocasStaticSizeSum},
      {"AllocasDyn", Info.AllocasDyn},
      {"DirectCalls", Info.DirectCalls},
      {"IndirectCalls", Info.IndirectCalls},
      {"DirectCallsToDefinedFunctions", Info.DirectCallsToDefinedFunctions},
      {"InlineAssemblyCalls", Info.InlineAssemblyCalls},
      {"Invokes", Info.Invokes},
      {"FlatAddrspaceAccesses", Info.FlatAddrspaceAccesses},
  };
  for (const auto &[CounterName, CounterValue] : Counters)
    remarkProperty(ORE, F, CounterName, CounterValue);
}

/// Emit kernel info remarks for \p F when passed remarks are enabled for this
/// pass. The IR is never modified, so all analyses are preserved.
PreservedAnalyses KernelInfoPrinter::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  // Without remarks enabled there is nothing useful to do, so bail out early.
  const auto *Handler = F.getContext().getDiagHandlerPtr();
  if (!Handler->isPassedOptRemarkEnabled(DEBUG_TYPE))
    return PreservedAnalyses::all();

  KernelInfo::emitKernelInfo(F, AM, TM);
  return PreservedAnalyses::all();
}
