
MLIR-AIE Vectorization


This post walks through the compiler optimizations for vectorization implemented in AMD's open-source mlir-aie project, focusing on the aie-vectorize pass.

Pass Pipeline

VectState

// A struct to pack the global state required for vectorization in one place.
// Local to this translation unit.
struct VectState {
  // A vector of all the reuse intervals created. Class IntervalReuse
  // represents a cluster of data accesses (with reuse potential) along the
  // vectorized dimension of each array. It clusters together reads that have
  // a potential of vector-level data reuse. Therefore, array accesses
  // A[i][j:j+8] and A[i+2][j:j+8] will map to different IntervalReuse objects.
  SmallVector<IntervalReuse *, 16> reuseIntervals;
  // Map from a transfer_read operation to the IntervalReuse object it
  // belongs to.
  mlir::DenseMap<Operation *, IntervalReuse *> opToIntervalMap;
  // Map from a transfer_read operation to its linearized access expression.
  // The linearized expression for access A[i][j], where A has dimensionality
  // MxN, is (i*N+j). We assume that the innermost dimension is the
  // vectorized dimension.
  mlir::DenseMap<Operation *, AffineExpr> linearizedAccess;
  // A map from an index (of an array access) to its dim expression (e.g.,
  // i->d0). We need this to create the correct linearized expressions for
  // all the array accesses in the function.
  mlir::DenseMap<Value, AffineExpr> indexToExprDimMap;
  // For each transfer_read operation, a map from its containing basic block
  // to the enclosing for/while loops. This helps us identify two
  // instructions that are nested together, even if they belong to different
  // basic blocks.
  mlir::DenseMap<Block *, SmallVector<Operation *, 8>> blockToEnclosingLoops;
  // This is specific to the 8x8 scheme. For an 8x8 scheme, every mul/fma is
  // replaced by two mul/fmas in the AIE dialect, so we keep track of the
  // pair.
  mlir::DenseMap<Operation *, Operation *> pairedOp;
  // If we fuse a representative mul/fma op with another fma op to exploit
  // the column topology of the AIE intrinsic, then cache, for the
  // representative op, the compile-time constant access distance between
  // their two operands. The first (second) offset of the pair represents the
  // access distance between the first (second) operands of the
  // representative op and the fused op(s). This access distance will be used
  // to compute the xstep/zstep attribute.
  mlir::DenseMap<Operation *, std::pair<int32_t, int32_t>> opToColOffsets;
  // Map from a sext op to the defining op of the sext operand.
  mlir::DenseMap<Operation *, Operation *> sextTruncDefMap;
  // A set of operations that are msc (fmsub) ops. We do not differentiate
  // between mac and msc ops at the vector dialect level; the only op in the
  // vector dialect is the FMA op.
  llvm::SmallSet<Operation *, 8> mscOps;
  // Used to build and insert all the new operations created.
  OpBuilder builder;
  // The shift val for ups and srs intrinsics. This value should be between 0
  // and 63.
  int8_t shift;
  // The zero offset, indicating the position of the recurring 0 in the input
  // filter. The counting starts at 1. For example, if the filter array is
  // {1,2,3,0,4,5,6,0,7,8,9,0}, then zeroOffset=4.
  int32_t zeroOffset;
  // The duplicate count, indicating the number of times a value is
  // duplicated in the filter. The filter values must be duplicated at least
  // twice for the i8xi8 scheme. An example filter for the i8xi8 scheme is
  // {0,0,1,1,2,2,3,3}, with dupFactor=2.
  int32_t dupFactor;

  bool unalignedLoadsCheck, aieml;

  // Constructors
  VectState(MLIRContext *context, int8_t s, int32_t z, int32_t d,
            bool unalignedLoadsCheck, bool aieml)
      : builder(context), shift(s), zeroOffset(z), dupFactor(d),
        unalignedLoadsCheck(unalignedLoadsCheck), aieml(aieml) {}

  IntervalReuse *getIntervalForOperation(Operation *op);
};
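
As a concrete illustration of the linearizedAccess and indexToExprDimMap fields, the sketch below (a hypothetical standalone helper, not code from the pass) builds the linearized expression i*N + j for an access A[i][j] with the MLIR AffineExpr API, after mapping the loop indices i and j to dim expressions d0 and d1:

#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/MLIRContext.h"

#include <cstdint>

// Build the linearized access expression for A[i][j] in an MxN array,
// assuming the mapping i -> d0 and j -> d1 (the indexToExprDimMap
// convention described above).
mlir::AffineExpr linearizeAccess(mlir::MLIRContext *ctx, int64_t N) {
  mlir::AffineExpr i = mlir::getAffineDimExpr(0, ctx); // i -> d0
  mlir::AffineExpr j = mlir::getAffineDimExpr(1, ctx); // j -> d1
  return i * N + j; // linearized form: d0 * N + d1
}

For an access like A[i+1][j+2], the same construction yields (d0+1)*N + d1 + 2; as the IntervalReuse comments below describe, the constant part (2) is then peeled off as the offset, and the remainder forms the base.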

Interval Reuse

A major focus of the whole vectorization pass is reusing and coalescing vector registers. The main data structures for this live in the IntervalReuse.cpp and IntervalReuse.h files. This subsection analyzes the concrete implementation of the interval-reuse analysis algorithm.

//===- IntervalReuse.h - AIE Vector Data Reuse Computation ------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2022 Xilinx Inc.
//
//===----------------------------------------------------------------------===//
// Class to compute potential data reuse in AIE vectors
//===----------------------------------------------------------------------===//

#ifndef AIE_DIALECT_AIEVEC_TRANSFORMS_INTERVALREUSE_H
#define AIE_DIALECT_AIEVEC_TRANSFORMS_INTERVALREUSE_H

#include "aie/Dialect/AIEVec/AIEVecUtils.h"

#include <utility>

namespace xilinx::aievec {

// This class is a key optimization point from AMD: interval reuse. The
// optimization was proposed in Vyasa.
// The IntervalReuse class.
// This class captures the potential data reuse in the AIE vector. Each load
// from memory (coming from a transfer_read op) will exclusively belong to one
// of the IntervalReuse objects. Note that in AIE, the minimum vector size is
// 128 bits, and vectors are aligned to the vector-size boundary.
// Let A and B be NxN arrays of 32-bit values. We assume no aliasing. Consider
// the following three cases, where we want to determine which IntervalReuse
// object the two read op accesses within the same loop nest will be mapped to:
// 1. Accesses A[i][j:j+8] and A[i][j+1:j+9]: they exhibit data reuse if we load
// the range A[j:j+16] from memory into a 512-bit vector. Therefore, these two
// read ops will belong to the same IntervalReuse object.
// 2. Accesses A[i][j:j+8] and A[i+1][j:j+8]: there is no non-trivial
// vector-level reuse possible for these accesses, and they must belong to
// different IntervalReuse objects.
// 3. Accesses A[i][j:j+8] and B[i][j:j+8]: there is no possible reuse, since
// the read ops access different arrays. Therefore, these two read ops must
// belong to different IntervalReuse objects.
// We want to correctly map read ops to IntervalReuse objects in all three
// cases.
//
// Understanding 'base' and 'memref' in the interval analysis: only accesses
// with the same memref and the same base can possibly reuse data. The memref
// identifies the array being accessed, and the base is the offset of the
// accessed element within it.
// The class contains 'memref' and 'base' to disambiguate reuse. 'memref'
// differentiates different arrays. For example, 'memref' for arrays A and B
// will be different. For accesses coming from the same array, 'base' helps in
// disambiguation. 'base' is just the linearized base expression of the access.
// The linearized expression for A[i][j] is i*N+j. We decompose it into base
// (i*N+j), and offset 0. In contrast, the linearized expression for
// A[i+1][j+2] is (i+1)*N+j+2, and we decompose it into base (i+1)*N+j, and
// offset 2. Basically, we abstract away the constant offset from the
// linearized access expression to form the base.
// Given a vector of IntervalReuse objects, we just search for an object with
// matching 'memref' and 'base' to group the read ops that can potentially
// reuse data in vector.
//
// Tracking how large a range of data each op reads:
// 'extentMap' stores the extent of data that an op reads. We store the extent
// in bits. For example, the extent for operation reading A[i][j:j+8] is
// [0,256].
// Loads are forced to align to a 512-bit boundary.
// The 'read access extent' corresponds to the aligned chunk of data that an
// operation loads. For example, an 8-lane, 32-bit vector load from A[i+7:i+15]
// would have read access extent [0:512], whereas under same conditions, the
// vector load from A[i+9:i+17] would have read access extent [512:768]. Note
// how the extents are aligned to vector size boundary.
//
// 'intervals' merges overlapping intervals to give the view of actual AIE
// vectors that need to be created. Since AIE only allows aligned vector loads,
// each interval is aligned to vector size. Continuing with the previous
// example, the two extents [0,256] and [128,512] overlap. Therefore these will
// be merged together to form a single interval [0,512]. The entries into
// 'intervals' are sorted, so given an op, we can find its interval by doing
// binary search with the op's extent as key (e.g., searching for [128,256] in
// {[0,512],[512,1024]}).

class IntervalReuse {
  // Differentiate arrays (e.g., A vs. B); the memref distinguishes accesses
  // to array A from accesses to array B.
  mlir::Value memref;
  // Differentiate accesses coming from the same array, but with a different
  // base expression along the non-vectorized dimension (e.g., A[i+1][j:j+8]
  // vs. A[i][j:j+8]). The base is the linearized access expression.
  mlir::AffineExpr base;
  // A map from each read operation to the extent of bits it reads. The
  // extent is in bits, aligned to the vector-size boundary (512 bits).
  llvm::DenseMap<mlir::Operation *, std::pair<int32_t, int32_t>> extentMap;
  // Disjoint (non-overlapping) intervals of all the data accesses (i.e.,
  // read bits). Each interval entry corresponds to a memory load into an
  // AIE vec.
  llvm::SmallVector<std::pair<int32_t, int32_t>, 8> intervals;
  // Identify all the vectors that are only used as the LHS operand of a
  // mul/mac op. The LHS operand of mul/mac ops has a specific size
  // requirement (e.g., for lmul, the LHS is twice the size of the RHS).
  llvm::SmallVector<bool, 8> vecIsLHSOperand;

  // Return true if this array access comes from the same array.
  bool sameMemRef(mlir::Value m) { return memref == m; }
  // Return true if this array access has the same invariant base
  // expression.
  bool sameInvariantIndices(mlir::AffineExpr b) { return base == b; }
  // Return true if this array access is enclosed within the same loop nest
  // as the other accesses belonging to this IntervalReuse object. Currently,
  // only operations under the same loop nest are considered for overlap
  // analysis.
  bool sameEnclosingLoops(
      mlir::Operation *op,
      llvm::DenseMap<mlir::Block *, llvm::SmallVector<mlir::Operation *, 8>>
          &blockToEnclosingLoops);

  // The intervals are maintained explicitly; an operation locates its
  // corresponding interval by index. For an operation, get the index into
  // 'intervals' that subsumes the operation's access extent.
  size_t getIntervalIndex(mlir::Operation *op);

public:
  // Return true if this read operation has a potential data reuse with other
  // read operations in this IntervalReuse object, i.e., whether the op can
  // reuse a vector register within the same enclosing loop nest.
  bool potentialReuse(
      mlir::vector::TransferReadOp readOp, mlir::AffineExpr invariantBase,
      llvm::DenseMap<mlir::Block *, llvm::SmallVector<mlir::Operation *, 8>>
          &blockToEnclosingLoops);
  // Insert the access extent of this read operation into intervals. The
  // extent must be aligned to the minimum AIE vector size (128 bits).
  void insertInterval(mlir::vector::TransferReadOp readOp,
                      llvm::DenseMap<mlir::Operation *, IntervalReuse *>
                          &dataAccessToIntervalMap,
                      int32_t offset, int32_t forLoopStepSize,
                      bool isSplat = false,
                      unsigned minVecSize = 128 /*min AIE vec size*/);
  // For a read operation, return the width of the (aligned) interval its
  // access extent belongs to. The interval width corresponds to the size of
  // the vector that will hold the load from the read operation.
  int32_t getIntervalWidth(mlir::Operation *op);
  // Get the read access extent of this read operation. The returned value
  // indicates the start and end offsets of the access from the base (in
  // bits).
  std::pair<int32_t, int32_t> getAccessExtent(mlir::Operation *op);
  // Set the read access extent of this read operation.
  void setAccessExtent(mlir::Operation *op,
                       std::pair<int32_t, int32_t> &extent);
  // Get the interval that contains this read operation's extent.
  std::pair<int32_t, int32_t> getInterval(mlir::Operation *op);
  // Given that the read operation 'op' is only the LHS operand of some
  // mul/mac op, mark the vector that will load its access extent.
  void markLHSOperandVec(mlir::Operation *op);
  // If the interval corresponds to a vector that is marked as the exclusive
  // LHS operand of some mul/mac op, and its size is <= 256 bits, try to
  // coalesce it with the next (adjacent) interval.
  void coalesceIntervals();
  // Constructors
  IntervalReuse(mlir::vector::TransferReadOp readOp, mlir::AffineExpr b)
      : memref(readOp.getSource()), base(b) {}
  IntervalReuse() : memref(nullptr), base(nullptr) {}
};

} // namespace xilinx::aievec

#endif // AIE_DIALECT_AIEVEC_TRANSFORMS_INTERVALREUSE_H
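
To make the bookkeeping above concrete, here is a small self-contained sketch (illustrative helpers, not the class's actual implementation, assuming half-open [lo, hi) extents in bits) of the two key steps: aligning a read's access extent to the vector-size boundary, and merging overlapping extents into the disjoint, sorted intervals list:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Interval = std::pair<int32_t, int32_t>; // [lo, hi) in bits

// Align an access extent outward to vecSize-bit boundaries (vecSize = 512
// in the running example from the comments above).
Interval alignExtent(Interval e, int32_t vecSize) {
  int32_t lo = (e.first / vecSize) * vecSize;
  int32_t hi = ((e.second + vecSize - 1) / vecSize) * vecSize;
  return {lo, hi};
}

// Insert an aligned extent into a list of disjoint intervals, merging every
// interval it overlaps, and keep the list sorted.
void insertMerged(std::vector<Interval> &intervals, Interval e) {
  std::vector<Interval> out;
  for (const Interval &iv : intervals) {
    if (iv.second < e.first || e.second < iv.first)
      out.push_back(iv); // disjoint: keep as-is
    else
      e = {std::min(iv.first, e.first), std::max(iv.second, e.second)};
  }
  out.push_back(e);
  std::sort(out.begin(), out.end());
  intervals = std::move(out);
}

int main() {
  std::vector<Interval> intervals;
  // The running example: extents [0,256] and [128,512] both align to the
  // 512-bit boundary and overlap, so they merge into the single interval
  // [0,512], i.e., one 512-bit AIE vector load serves both reads.
  insertMerged(intervals, alignExtent({0, 256}, 512));
  insertMerged(intervals, alignExtent({128, 512}, 512));
  // intervals == {{0, 512}}
  return 0;
}

Because the list stays sorted and disjoint, an op can then locate its interval with a binary search keyed on its access extent, which is what getIntervalIndex does.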

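Vectorization Result

Putting the pieces together, below is the IR of a conv2d kernel after the full pipeline (dumped after the aie-inner-unroll pass). Each pair of aievec.upd ops fills the low and high 256-bit halves (index 0/1, offset 0/256) of one 512-bit vector, the shared vector that the interval analysis determined can serve several overlapping reads, and the scalar multiply-accumulate chain has been lowered to aievec_aie1.mul/mac intrinsics whose xstart/zstart/zstep attributes select operand lanes within those shared vectors.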
// -----// IR Dump After AIEInnerUnroll (aie-inner-unroll) //----- //
module {
  func.func @conv2d(%arg0: memref<?x64xi16>, %arg1: memref<?x3xi16>, %arg2: memref<?x32xi16>) attributes {llvm.linkage = #llvm.linkage<external>} {
    %c0_i32 = arith.constant 0 : i32
    %c0 = arith.constant 0 : index
    %0 = aievec.upd %arg1[%c0, %c0] {index = 0 : i8, offset = 0 : i32} : memref<?x3xi16>, vector<16xi16>
    %c0_0 = arith.constant 0 : index
    %c32 = arith.constant 32 : index
    %c1 = arith.constant 1 : index
    scf.for %arg3 = %c0_0 to %c32 step %c1 {
      %c1_1 = arith.constant 1 : index
      %1 = arith.addi %arg3, %c1_1 : index
      %c2 = arith.constant 2 : index
      %2 = arith.addi %arg3, %c2 : index
      %c0_2 = arith.constant 0 : index
      %c32_3 = arith.constant 32 : index
      %c16 = arith.constant 16 : index
      %c32_4 = arith.constant 32 : index
      %3 = aievec.upd %arg0[%arg3, %c0_2] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %4 = aievec.upd %arg0[%arg3, %c0_2], %3 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %5 = aievec_aie1.mul %4, %0 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %6 = aievec_aie1.mac %4, %0, %5 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "13"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %7 = aievec.upd %arg0[%1, %c0_2] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %8 = aievec.upd %arg0[%1, %c0_2], %7 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %9 = aievec_aie1.mac %8, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "3", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %10 = aievec_aie1.mac %8, %0, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "5", zstep = "10"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %11 = aievec.upd %arg0[%2, %c0_2] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %12 = aievec.upd %arg0[%2, %c0_2], %11 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %13 = aievec_aie1.mac %12, %0, %10 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %14 = aievec_aie1.mac %12, %0, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "7"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %15 = aievec.srs %14, %c0_i32 : vector<16xi48>, i32, vector<16xi16>
      vector.transfer_write %15, %arg2[%arg3, %c0_2] : vector<16xi16>, memref<?x32xi16>
      %c1_5 = arith.constant 1 : index
      %16 = arith.muli %c16, %c1_5 : index
      %17 = arith.addi %c0_2, %16 : index
      %18 = aievec.upd %arg0[%arg3, %17] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %19 = aievec.upd %arg0[%arg3, %17], %18 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %20 = aievec_aie1.mul %19, %0 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %21 = aievec_aie1.mac %19, %0, %20 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "13"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %22 = aievec.upd %arg0[%1, %17] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %23 = aievec.upd %arg0[%1, %17], %22 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %24 = aievec_aie1.mac %23, %0, %21 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "3", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %25 = aievec_aie1.mac %23, %0, %24 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "5", zstep = "10"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %26 = aievec.upd %arg0[%2, %17] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %27 = aievec.upd %arg0[%2, %17], %26 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %28 = aievec_aie1.mac %27, %0, %25 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %29 = aievec_aie1.mac %27, %0, %28 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "7"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %30 = aievec.srs %29, %c0_i32 : vector<16xi48>, i32, vector<16xi16>
      vector.transfer_write %30, %arg2[%arg3, %17] : vector<16xi16>, memref<?x32xi16>
    }
    return
  }
}