admin管理员组文章数量:1594241
本系列文章是对GPU LLVM后端的探索与学习,后端的学习资料主要有LLVM源码和公开的Spec。众所周知,在PC GPU领域的玩家主要有三家公司:NV、AMD、INTEL。在LLVM后端开源的代码只有NV和AMD,而AMD相对NV的文档分享会更Open一些,比较容易找到它的Spec,所以整个系列文章会以AMD的GPU作为研究目标。AMD开源的代码对应的芯片有两款:R600和GCN,本系列文章关注GCN,因为它是更新的架构。以上是基本背景。
0. 预备知识
先编译出LLVM的llc工具,然后用llc执行下面的命令,并在addPassesToGenerateCode处下断点。这段代码使用了TargetPassConfig类,它是后端Codegen模块常用的Pass Pipeline配置工具类。AMDGPU后端会继承TargetPassConfig,并重写(override)其中Target-Specific的接口。从设计模式角度来看,这是典型的Template Method Pattern:将具体任务交给子类完成。我们重点关注AMD注册了哪些Pass。从宏观角度来看,Codegen Pass主要分为两个部分:指令选择和Machine层优化。
llc.exe -mtriple=amdgcn G:\Mesa3D_ws\llvm-11.0.0.src\llvm-11.0.0.src\test\CodeGen\AMDGPU\add.ll
addPassesToGenerateCode
/// Populate \p PM with the code generation pipeline for \p TM.
///
/// The pass configuration object is obtained from the target machine, so a
/// target can supply its own subclass by overriding createPassConfig. The
/// configuration and \p MMIWP are both added to \p PM before any pipeline
/// stage is registered.
///
/// \param TM            Target machine that creates the pass configuration.
/// \param PM            Pass manager receiving the passes.
/// \param DisableVerify Forwarded to the pass configuration.
/// \param MMIWP         Machine module info wrapper pass added to \p PM.
/// \returns the pass configuration, or nullptr when addISelPasses() reports
///          failure by returning true.
static TargetPassConfig *
addPassesToGenerateCode(LLVMTargetMachine &TM, PassManagerBase &PM,
                        bool DisableVerify,
                        MachineModuleInfoWrapperPass &MMIWP) {
  TargetPassConfig *Config = TM.createPassConfig(PM);

  // Apply the options handed down by the caller, then register the
  // configuration and the machine module info with the pass manager.
  Config->setDisableVerify(DisableVerify);
  PM.add(Config);
  PM.add(&MMIWP);

  // Stage 1: instruction selection. A true result means it could not be set
  // up, in which case no machine passes are added.
  const bool ISelSetupFailed = Config->addISelPasses();
  if (ISelSetupFailed)
    return nullptr;

  // Stage 2: machine-level optimization passes.
  Config->addMachinePasses();
  Config->setInitialized();
  return Config;
}
1. 指令选择Pass
TargetPassConfig::addISelPasses
// Registers the passes that run up to and including instruction selection.
// The IR-level hooks are called first, then ISel preparation, and finally the
// core instruction selection passes. Returns whatever addCoreISelPasses()
// returns.
bool TargetPassConfig::addISelPasses() {
if (TM->useEmulatedTLS())
addPass(createLowerEmuTLSPass());
addPass(createPreISelIntrinsicLoweringPass());
addPass(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
addIRPasses(); // virtual hook, dispatched to the target's override
addCodeGenPrepare(); // virtual hook, dispatched to the target's override
addPassesToHandleExceptions();
addISelPrepare();
return addCoreISelPasses();
}
AMDGPUPassConfig
// AMDGPU pass configuration: derives from TargetPassConfig and overrides the
// hooks listed below. The "..." lines mark members omitted from this excerpt.
class AMDGPUPassConfig : public TargetPassConfig{
...
// IR-level and pre-ISel hooks overridden for AMDGPU targets.
void addIRPasses() override;
void addCodeGenPrepare() override;
bool addPreISel() override;
bool addInstSelector() override;
...
}
GCNPassConfig
// GCN pass configuration: a final subclass of AMDGPUPassConfig that overrides
// the instruction selection, GlobalISel, register allocation and late
// machine-pass hooks declared below. "...." marks members omitted from this
// excerpt.
class GCNPassConfig final : public AMDGPUPassConfig {
public:
....
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override;
// Hooks that run before and during instruction selection.
bool addPreISel() override;
void addMachineSSAOptimization() override;
bool addILPOpts() override;
bool addInstSelector() override;
// GlobalISel pipeline hooks.
bool addIRTranslator() override;
void addPreLegalizeMachineIR() override;
bool addLegalizeMachineIR() override;
void addPreRegBankSelect() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
// Register allocation and later machine-level hooks.
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
void addPreRegAlloc() override;
bool addPreRewrite() override;
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
1.1 AMDGPUPassConfig::addIRPasses()
addIRPasses 接口注册了LLVM IR层面的优化, 从源码可以看出在调用TargetPassConfig::addIRPasses() 通用优化Pass pipeline之前, AMD Target有些Patch工作需要处理 , 同时使用了一些标量优化.
void AMDGPUPassConfig::addIRPasses() {
const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
// There is no reason to run these.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
addPass(createAMDGPUPrintfRuntimeBinding());
addPass(createAMDGPUFixFunctionBitcastsPass());
addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
addPass(createAtomicExpandPass());
addPass(createAMDGPULowerIntrinsicsPass());
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
addPass(createBarrierNoopPass());
if (TM.getTargetTriple().getArch() == Triple::r600)
addPass(createR600OpenCLImageTypeLoweringPass());
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createInferAddressSpacesPass());
addPass(createAMDGPUPromoteAlloca());
if (EnableSROA)
addPass(createSROAPass());
if (EnableScalarIRPasses)
addStraightLineScalarOptimizationPasses(); // Amd gpu 标量优化
if (EnableAMDGPUAliasAnalysis) {
addPass(createAMDGPUAAWrapperPass());
addPass(createExternalAAWrapperPass([](Pass &P, Function &,
AAResults &AAR) {
if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
AAR.addAAResult(WrapperPass->getResult());
}));
}
}
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
addPass(createAMDGPUCodeGenPreparePass());
}
TargetPassConfig::addIRPasses(); // 通用优化Pass
if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
addEarlyCSEOrGVNPass();
}
AMD GPU 标量优化 Pass
// Registers the straight-line scalar optimization sequence: LICM, GEP
// constant-offset separation, speculative execution, straight-line strength
// reduction, EarlyCSE-or-GVN, n-ary reassociation and a final EarlyCSE. The
// comments below explain why each later pass follows the earlier ones.
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
addPass(createLICMPass());
addPass(createSeparateConstOffsetFromGEPPass());
addPass(createSpeculativeExecutionPass());
// ReassociateGEPs exposes more opportunities for SLSR. See
// the example in reassociate-geps-and-slsr.ll.
addPass(createStraightLineStrengthReducePass());
// SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
// EarlyCSE can reuse.
addEarlyCSEOrGVNPass();
// Run NaryReassociate after EarlyCSE/GVN to be more effective.
addPass(createNaryReassociatePass());
// NaryReassociate on GEPs creates redundant common expressions, so run
// EarlyCSE after it.
addPass(createEarlyCSEPass());
}
1.2 AMDGPUPassConfig::addCodeGenPrepare()
addCodeGenPrepare 接口注册了Codegen之前的一些准备工作。同样从源码可以看出,在调用通用的TargetPassConfig::addCodeGenPrepare()之前,AMD Target也有一些Patch工作需要处理;调用之后还会按需追加LoadStoreVectorizer,并追加LowerSwitch Pass。
/// Register the passes that prepare IR for code generation.
///
/// On amdgcn, kernel feature annotation and (when enabled) kernel argument
/// lowering come first. The performance hint analysis and the generic
/// TargetPassConfig::addCodeGenPrepare() follow for every architecture, then
/// the optional load/store vectorizer and switch lowering.
void AMDGPUPassConfig::addCodeGenPrepare() {
  const bool IsAMDGCN = TM->getTargetTriple().getArch() == Triple::amdgcn;

  if (IsAMDGCN) {
    addPass(createAMDGPUAnnotateKernelFeaturesPass());

    if (EnableLowerKernelArguments) {
      addPass(createAMDGPULowerKernelArgumentsPass());
    }
  }

  addPass(&AMDGPUPerfHintAnalysisID);

  // Generic CodeGenPrepare shared by all targets.
  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer) {
    addPass(createLoadStoreVectorizerPass());
  }

  addPass(createLowerSwitchPass());
}
1.3 TargetPassConfig::addISelPrepare
// Registers the passes that run immediately before instruction selection:
// the target's pre-ISel hook, an optional dummy CGSCC pass, stack safety and
// protection passes, and optional IR printing and verification.
void TargetPassConfig::addISelPrepare() {
addPreISel(); // virtual hook -> GCNPassConfig::addPreISel()
if (requiresCodeGenSCCOrder())
addPass(new DummyCGSCCPass);
addPass(createSafeStackPass());
addPass(createStackProtectorPass());
// Optionally dump the final IR that is handed to instruction selection.
if (PrintISelInput)
addPass(createPrintFunctionPass(dbgs(), "\n\n*** Final LLVM Code input to ISel ***\n"));
if (!DisableVerify)
addPass(createVerifierPass());
}
/// Register the GCN passes that run right before instruction selection.
///
/// The common AMDGPU pre-ISel passes are added first, followed by the
/// optional atomic optimizer and divergent exit node unification. Unless CFG
/// structurization is deferred (LateCFGStructurize), the CFG is structurized
/// here and control flow is annotated afterwards.
///
/// \returns false unconditionally.
bool GCNPassConfig::addPreISel() {
  // Start with the passes shared by every AMDGPU target.
  AMDGPUPassConfig::addPreISel();

  if (EnableAtomicOptimizations)
    addPass(createAMDGPUAtomicOptimizerPass());

  addPass(&AMDGPUUnifyDivergentExitNodesID);

  const bool StructurizeBeforeISel = !LateCFGStructurize;

  if (StructurizeBeforeISel) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    // Passing true here would select SkipUniformRegions.
    addPass(createStructurizeCFGPass(/*SkipUniformRegions=*/false));
  }

  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());

  if (StructurizeBeforeISel)
    addPass(createSIAnnotateControlFlowPass());

  addPass(createLCSSAPass());
  return false;
}
// Common AMDGPU pre-ISel hook: registers only the CFG flattening pass and
// always returns false.
bool AMDGPUPassConfig::addPreISel() {
addPass(createFlattenCFGPass());
return false;
}
1.4 TargetPassConfig::addCoreISelPasses
LLVM 指令选择算法有三种:(Todo: 后续文章会详细介绍这三种算法,链接:)
- FastISel
- SelectionDAG
- GlobalISel
SelectionDAG 为默认的指令选择算法,后端代码也会继承这个类,并重写(override)部分接口,设计模式同样是Template Method。
// Picks an instruction selector and registers its passes. Returns true when
// the target's addInstSelector() hook returns true; otherwise FinalizeISel is
// added and false is returned. "...." marks code omitted from this excerpt,
// including the logic that assigns Selector.
bool TargetPassConfig::addCoreISelPasses() {
// Determine an instruction selector.
enum class SelectorType { SelectionDAG, FastISel, GlobalISel };
SelectorType Selector;
....
// Add instruction selector passes.
if (Selector == SelectorType::GlobalISel) {
....
}
else if (addInstSelector()) // GCNPassConfig::addInstSelector
return true;
addPass(&FinalizeISelID);
printAndVerify("After Instruction Selection");
return false;
}
1.5 GCNPassConfig::addInstSelector
// GCN instruction selection hook: runs the common AMDGPU selector first and
// then the SI-specific fix-up passes in the order listed. Always returns
// false.
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector(); // SelectionDAG
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
// FinalizeISel is registered here, ahead of the vector ISel fix-up pass.
addPass(&FinalizeISelID);
addPass(createSIFixupVectorISelPass());
addPass(createSIAddIMGInitPass());
return false;
}
// Common AMDGPU instruction selection hook: registers the AMDGPU
// SelectionDAG instruction selector built for the current target machine and
// optimization level. Always returns false.
bool AMDGPUPassConfig::addInstSelector() {
// Defer the verifier until FinalizeISel.
addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false); // AMD SelectionDAG
return false;
}
2. Machine 层优化
Machine层优化同样分为通用部分和Target定制部分。
TargetPassConfig::addMachinePasses
// Registers the machine-level pipeline by calling a series of overridable
// hooks: SSA optimization (or local stack slot allocation when not
// optimizing), register allocation, and the pre-scheduling and pre-emit
// stages. "..." and "...." mark code omitted from this excerpt.
void TargetPassConfig::addMachinePasses() {
AddingMachinePasses = true;
....
if (getOptLevel() != CodeGenOpt::None) {
addMachineSSAOptimization(); // overridable target hook
} else {
addPass(&LocalStackSlotAllocationID);
}
addPreRegAlloc(); // overridable target hook
// Choose between the optimizing and the fast register allocation pipeline.
if (getOptimizeRegAlloc())
addOptimizedRegAlloc(); // overridable target hook
else
addFastRegAlloc(); // overridable target hook
addPostRegAlloc(); // overridable target hook
...
addPreSched2(); // overridable target hook
....
addPreEmitPass(); // overridable target hook
...
addPreEmitPass2(); // overridable target hook
}
3. 总结
《Getting Started with LLVM Core Libraries》一书中的后端框架示意图:
AMDGPU 后端Pipeline 如下图所示:
4. 参考资料
- https://en.wikipedia.org/wiki/Graphics_Core_Next#Instruction_set
- llvm11 source code
版权声明:本文标题:LLVM AMDGPU 后端代码分析研究(1):PassPipe Line 内容由热心网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:https://www.elefans.com/dianzi/1726768819a1083612.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论