
MLIR-AIE Vectorization


This post walks through the compiler optimizations for vectorization implemented in AMD's open-source mlir-aie project, focusing on the aie-vectorize pass.

Pass Pipeline

VectState

// A struct to pack the global state required for vectorization in one place.
// Local to this translation unit.
struct VectState {
  // A vector of all the reuse intervals created. Class IntervalReuse
  // represents a cluster of data accesses (with reuse potential) along the
  // vectorized dimension of each array. It clusters together reads that have
  // a potential of vector-level data reuse. Therefore, array accesses
  // A[i][j:j+8] and A[i+2][j:j+8] will map to different IntervalReuse objects.
  SmallVector<IntervalReuse *, 16> reuseIntervals;
  // Map from a transfer_read operation to the IntervalReuse object it
  // belongs to.
  mlir::DenseMap<Operation *, IntervalReuse *> opToIntervalMap;
  // Map from a transfer_read operation to its linearized access expression.
  // The linearized expression for access A[i][j], where A has dimensionality
  // MxN, is (i*N+j). We assume that the innermost dimension is the
  // vectorized dimension.
  mlir::DenseMap<Operation *, AffineExpr> linearizedAccess;
  // A map from an index (of an array access) to its dim expression (e.g.,
  // i->d0). We need this to create the correct linearized expressions for
  // all the array accesses in the function.
  mlir::DenseMap<Value, AffineExpr> indexToExprDimMap;
  // For each transfer_read operation, a map from its containing basic block
  // to the enclosing for/while loops. This helps us identify two
  // instructions that are nested together, even if they belong to different
  // basic blocks.
  mlir::DenseMap<Block *, SmallVector<Operation *, 8>> blockToEnclosingLoops;
  // This is specific to the 8x8 scheme. For an 8x8 scheme, every mul/fma is
  // replaced by two mul/fmas in the AIE dialect, so we keep track of the
  // pair.
  mlir::DenseMap<Operation *, Operation *> pairedOp;
  // If we fuse a representative mul/fma op with another fma op to exploit
  // the column topology of the AIE intrinsic, then cache, for the
  // representative op, the compile-time constant access distance between
  // their two operands. The first (second) offset of the pair represents the
  // access distance between the first (second) operands of the
  // representative op and the fused op(s). This access distance will be used
  // to compute the xstep/zstep attribute.
  mlir::DenseMap<Operation *, std::pair<int32_t, int32_t>> opToColOffsets;
  // Map from a sext op to the defining op of the sext operand.
  mlir::DenseMap<Operation *, Operation *> sextTruncDefMap;
  // A set of operations that are msc (fmsub) ops. We do not differentiate
  // between mac and msc ops at the vector dialect level; the only op in the
  // vector dialect is the FMA op.
  llvm::SmallSet<Operation *, 8> mscOps;
  // Used to build and insert all the new operations created.
  OpBuilder builder;
  // The shift val for ups and srs intrinsics. This value should be between 0
  // and 63.
  int8_t shift;
  // The zero offset, indicating the position of the recurring 0 in the input
  // filter. The counting starts at 1. For example, if the filter array is
  // {1,2,3,0,4,5,6,0,7,8,9,0}, then zeroOffset=4.
  int32_t zeroOffset;
  // The duplicate count, indicating the number of times a value is
  // duplicated in the filter. The filter values must be duplicated at least
  // twice for the i8xi8 scheme. An example filter for the i8xi8 scheme is
  // {0,0,1,1,2,2,3,3}, with dupFactor=2.
  int32_t dupFactor;

  bool unalignedLoadsCheck, aieml;

  // Constructors
  VectState(MLIRContext *context, int8_t s, int32_t z, int32_t d,
            bool unalignedLoadsCheck, bool aieml)
      : builder(context), shift(s), zeroOffset(z), dupFactor(d),
        unalignedLoadsCheck(unalignedLoadsCheck), aieml(aieml) {}

  IntervalReuse *getIntervalForOperation(Operation *op);
};
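
As a concrete illustration of the linearizedAccess and indexToExprDimMap fields, the sketch below (a hypothetical standalone helper, not code from the pass) builds the linearized expression i*N + j for an access A[i][j] with the MLIR AffineExpr API, after mapping the loop indices i and j to dim expressions d0 and d1:

#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/MLIRContext.h"

#include <cstdint>

// Build the linearized access expression for A[i][j] in an MxN array,
// assuming the mapping i -> d0 and j -> d1 (the indexToExprDimMap
// convention described above).
mlir::AffineExpr linearizeAccess(mlir::MLIRContext *ctx, int64_t N) {
  mlir::AffineExpr i = mlir::getAffineDimExpr(0, ctx); // i -> d0
  mlir::AffineExpr j = mlir::getAffineDimExpr(1, ctx); // j -> d1
  return i * N + j; // linearized form: d0 * N + d1
}

For an access like A[i+1][j+2], the same construction yields (d0+1)*N + d1 + 2; as the IntervalReuse comments below describe, the constant part (2) is then peeled off as the offset, and the remainder forms the base.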

Interval Reuse

A major focus of the whole vectorization pass is reusing and coalescing vector registers. The main data structures for this live in the IntervalReuse.cpp and IntervalReuse.h files. This subsection analyzes the concrete implementation of the interval-reuse analysis algorithm.

//===- IntervalReuse.h - AIE Vector Data Reuse Computation ------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2022 Xilinx Inc.
//
//===----------------------------------------------------------------------===//
// Class to compute potential data reuse in AIE vectors
//===----------------------------------------------------------------------===//

#ifndef AIE_DIALECT_AIEVEC_TRANSFORMS_INTERVALREUSE_H
#define AIE_DIALECT_AIEVEC_TRANSFORMS_INTERVALREUSE_H

#include "aie/Dialect/AIEVec/AIEVecUtils.h"

#include <utility>

namespace xilinx::aievec {

// This class is a key optimization point from AMD: interval reuse. The
// optimization was proposed in Vyasa.
// The IntervalReuse class.
// This class captures the potential data reuse in the AIE vector. Each load
// from memory (coming from a transfer_read op) will exclusively belong to one
// of the IntervalReuse objects. Note that in AIE, the minimum vector size is
// 128 bits, and vectors are aligned to the vector-size boundary.
// Let A and B be NxN arrays of 32-bit values. We assume no aliasing. Consider
// the following three cases, where we want to determine which IntervalReuse
// object the two read op accesses within the same loop nest will be mapped to:
// 1. Accesses A[i][j:j+8] and A[i][j+1:j+9]: they exhibit data reuse if we load
// the range A[j:j+16] from memory into a 512-bit vector. Therefore, these two
// read ops will belong to the same IntervalReuse object.
// 2. Accesses A[i][j:j+8] and A[i+1][j:j+8]: there is no non-trivial
// vector-level reuse possible for these accesses, and they must belong to
// different IntervalReuse objects.
// 3. Accesses A[i][j:j+8] and B[i][j:j+8]: there is no possible reuse, since
// the read ops access different arrays. Therefore, these two read ops must
// belong to different IntervalReuse objects.
// We want to correctly map read ops to IntervalReuse objects in all three
// cases.
//
// Understanding 'base' and 'memref' in the interval analysis: only accesses
// with the same memref and the same base can possibly reuse data. The memref
// identifies the array being accessed, and the base is the offset of the
// accessed element within it.
// The class contains 'memref' and 'base' to disambiguate reuse. 'memref'
// differentiates different arrays. For example, 'memref' for arrays A and B
// will be different. For accesses coming from the same array, 'base' helps in
// disambiguation. 'base' is just the linearized base expression of the access.
// The linearized expression for A[i][j] is i*N+j. We decompose it into base
// (i*N+j), and offset 0. In contrast, the linearized expression for
// A[i+1][j+2] is (i+1)*N+j+2, and we decompose it into base (i+1)*N+j, and
// offset 2. Basically, we abstract away the constant offset from the
// linearized access expression to form the base.
// Given a vector of IntervalReuse objects, we just search for an object with
// matching 'memref' and 'base' to group the read ops that can potentially
// reuse data in vector.
//
// Tracking how large a range of data each op reads:
// 'extentMap' stores the extent of data that an op reads. We store the extent
// in bits. For example, the extent for operation reading A[i][j:j+8] is
// [0,256].
// Loads are forced to align to a 512-bit boundary.
// The 'read access extent' corresponds to the aligned chunk of data that an
// operation loads. For example, an 8-lane, 32-bit vector load from A[i+7:i+15]
// would have read access extent [0:512], whereas under same conditions, the
// vector load from A[i+9:i+17] would have read access extent [512:768]. Note
// how the extents are aligned to vector size boundary.
//
// 'intervals' merges overlapping intervals to give the view of actual AIE
// vectors that need to be created. Since AIE only allows aligned vector loads,
// each interval is aligned to vector size. Continuing with the previous
// example, the two extents [0,256] and [128,512] overlap. Therefore these will
// be merged together to form a single interval [0,512]. The entries into
// 'intervals' are sorted, so given an op, we can find its interval by doing
// binary search with the op's extent as key (e.g., searching for [128,256] in
// {[0,512],[512,1024]}).

class IntervalReuse {
  // Differentiate arrays (e.g., A vs. B); the memref distinguishes accesses
  // to array A from accesses to array B.
  mlir::Value memref;
  // Differentiate accesses coming from the same array, but with a different
  // base expression along the non-vectorized dimension (e.g., A[i+1][j:j+8]
  // vs. A[i][j:j+8]). The base is the linearized access expression.
  mlir::AffineExpr base;
  // A map from each read operation to the extent of bits it reads. The
  // extent is in bits, aligned to the vector-size boundary (512 bits).
  llvm::DenseMap<mlir::Operation *, std::pair<int32_t, int32_t>> extentMap;
  // Disjoint (non-overlapping) intervals of all the data accesses (i.e.,
  // read bits). Each interval entry corresponds to a memory load into an
  // AIE vec.
  llvm::SmallVector<std::pair<int32_t, int32_t>, 8> intervals;
  // Identify all the vectors that are only used as the LHS operand of a
  // mul/mac op. The LHS operand of mul/mac ops has a specific size
  // requirement (e.g., for lmul, the LHS is twice the size of the RHS).
  llvm::SmallVector<bool, 8> vecIsLHSOperand;

  // Return true if this array access comes from the same array.
  bool sameMemRef(mlir::Value m) { return memref == m; }
  // Return true if this array access has the same invariant base
  // expression.
  bool sameInvariantIndices(mlir::AffineExpr b) { return base == b; }
  // Return true if this array access is enclosed within the same loop nest
  // as the other accesses belonging to this IntervalReuse object. Currently,
  // only operations under the same loop nest are considered for overlap
  // analysis.
  bool sameEnclosingLoops(
      mlir::Operation *op,
      llvm::DenseMap<mlir::Block *, llvm::SmallVector<mlir::Operation *, 8>>
          &blockToEnclosingLoops);

  // The intervals are maintained explicitly; an operation locates its
  // corresponding interval by index. For an operation, get the index into
  // 'intervals' that subsumes the operation's access extent.
  size_t getIntervalIndex(mlir::Operation *op);

public:
  // Return true if this read operation has a potential data reuse with other
  // read operations in this IntervalReuse object, i.e., whether the op can
  // reuse a vector register within the same enclosing loop nest.
  bool potentialReuse(
      mlir::vector::TransferReadOp readOp, mlir::AffineExpr invariantBase,
      llvm::DenseMap<mlir::Block *, llvm::SmallVector<mlir::Operation *, 8>>
          &blockToEnclosingLoops);
  // Insert the access extent of this read operation into intervals. The
  // extent must be aligned to the minimum AIE vector size (128 bits).
  void insertInterval(mlir::vector::TransferReadOp readOp,
                      llvm::DenseMap<mlir::Operation *, IntervalReuse *>
                          &dataAccessToIntervalMap,
                      int32_t offset, int32_t forLoopStepSize,
                      bool isSplat = false,
                      unsigned minVecSize = 128 /*min AIE vec size*/);
  // For a read operation, return the width of the (aligned) interval its
  // access extent belongs to. The interval width corresponds to the size of
  // the vector that will hold the load from the read operation.
  int32_t getIntervalWidth(mlir::Operation *op);
  // Get the read access extent of this read operation. The returned value
  // indicates the start and end offsets of the access from the base (in
  // bits).
  std::pair<int32_t, int32_t> getAccessExtent(mlir::Operation *op);
  // Set the read access extent of this read operation.
  void setAccessExtent(mlir::Operation *op,
                       std::pair<int32_t, int32_t> &extent);
  // Get the interval that contains this read operation's extent.
  std::pair<int32_t, int32_t> getInterval(mlir::Operation *op);
  // Given that the read operation 'op' is only the LHS operand of some
  // mul/mac op, mark the vector that will load its access extent.
  void markLHSOperandVec(mlir::Operation *op);
  // If the interval corresponds to a vector that is marked as the exclusive
  // LHS operand of some mul/mac op, and its size is <= 256 bits, try to
  // coalesce it with the next (adjacent) interval.
  void coalesceIntervals();
  // Constructors
  IntervalReuse(mlir::vector::TransferReadOp readOp, mlir::AffineExpr b)
      : memref(readOp.getSource()), base(b) {}
  IntervalReuse() : memref(nullptr), base(nullptr) {}
};

} // namespace xilinx::aievec

#endif // AIE_DIALECT_AIEVEC_TRANSFORMS_INTERVALREUSE_H
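
To make the bookkeeping above concrete, here is a small self-contained sketch (illustrative helpers, not the class's actual implementation, assuming half-open [lo, hi) extents in bits) of the two key steps: aligning a read's access extent to the vector-size boundary, and merging overlapping extents into the disjoint, sorted intervals list:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Interval = std::pair<int32_t, int32_t>; // [lo, hi) in bits

// Align an access extent outward to vecSize-bit boundaries (vecSize = 512
// in the running example from the comments above).
Interval alignExtent(Interval e, int32_t vecSize) {
  int32_t lo = (e.first / vecSize) * vecSize;
  int32_t hi = ((e.second + vecSize - 1) / vecSize) * vecSize;
  return {lo, hi};
}

// Insert an aligned extent into a list of disjoint intervals, merging every
// interval it overlaps, and keep the list sorted.
void insertMerged(std::vector<Interval> &intervals, Interval e) {
  std::vector<Interval> out;
  for (const Interval &iv : intervals) {
    if (iv.second < e.first || e.second < iv.first)
      out.push_back(iv); // disjoint: keep as-is
    else
      e = {std::min(iv.first, e.first), std::max(iv.second, e.second)};
  }
  out.push_back(e);
  std::sort(out.begin(), out.end());
  intervals = std::move(out);
}

int main() {
  std::vector<Interval> intervals;
  // The running example: extents [0,256] and [128,512] both align to the
  // 512-bit boundary and overlap, so they merge into the single interval
  // [0,512], i.e., one 512-bit AIE vector load serves both reads.
  insertMerged(intervals, alignExtent({0, 256}, 512));
  insertMerged(intervals, alignExtent({128, 512}, 512));
  // intervals == {{0, 512}}
  return 0;
}

Because the list stays sorted and disjoint, an op can then locate its interval with a binary search keyed on its access extent, which is what getIntervalIndex does.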

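Vectorization Result

Putting the pieces together, below is the IR of a conv2d kernel after the full pipeline (dumped after the aie-inner-unroll pass). Each pair of aievec.upd ops fills the low and high 256-bit halves (index 0/1, offset 0/256) of one 512-bit vector, the shared vector that the interval analysis determined can serve several overlapping reads, and the scalar multiply-accumulate chain has been lowered to aievec_aie1.mul/mac intrinsics whose xstart/zstart/zstep attributes select operand lanes within those shared vectors.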
// -----// IR Dump After AIEInnerUnroll (aie-inner-unroll) //----- //
module {
  func.func @conv2d(%arg0: memref<?x64xi16>, %arg1: memref<?x3xi16>, %arg2: memref<?x32xi16>) attributes {llvm.linkage = #llvm.linkage<external>} {
    %c0_i32 = arith.constant 0 : i32
    %c0 = arith.constant 0 : index
    %0 = aievec.upd %arg1[%c0, %c0] {index = 0 : i8, offset = 0 : i32} : memref<?x3xi16>, vector<16xi16>
    %c0_0 = arith.constant 0 : index
    %c32 = arith.constant 32 : index
    %c1 = arith.constant 1 : index
    scf.for %arg3 = %c0_0 to %c32 step %c1 {
      %c1_1 = arith.constant 1 : index
      %1 = arith.addi %arg3, %c1_1 : index
      %c2 = arith.constant 2 : index
      %2 = arith.addi %arg3, %c2 : index
      %c0_2 = arith.constant 0 : index
      %c32_3 = arith.constant 32 : index
      %c16 = arith.constant 16 : index
      %c32_4 = arith.constant 32 : index
      %3 = aievec.upd %arg0[%arg3, %c0_2] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %4 = aievec.upd %arg0[%arg3, %c0_2], %3 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %5 = aievec_aie1.mul %4, %0 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %6 = aievec_aie1.mac %4, %0, %5 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "13"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %7 = aievec.upd %arg0[%1, %c0_2] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %8 = aievec.upd %arg0[%1, %c0_2], %7 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %9 = aievec_aie1.mac %8, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "3", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %10 = aievec_aie1.mac %8, %0, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "5", zstep = "10"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %11 = aievec.upd %arg0[%2, %c0_2] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %12 = aievec.upd %arg0[%2, %c0_2], %11 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %13 = aievec_aie1.mac %12, %0, %10 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %14 = aievec_aie1.mac %12, %0, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "7"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %15 = aievec.srs %14, %c0_i32 : vector<16xi48>, i32, vector<16xi16>
      vector.transfer_write %15, %arg2[%arg3, %c0_2] : vector<16xi16>, memref<?x32xi16>
      %c1_5 = arith.constant 1 : index
      %16 = arith.muli %c16, %c1_5 : index
      %17 = arith.addi %c0_2, %16 : index
      %18 = aievec.upd %arg0[%arg3, %17] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %19 = aievec.upd %arg0[%arg3, %17], %18 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %20 = aievec_aie1.mul %19, %0 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %21 = aievec_aie1.mac %19, %0, %20 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "13"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %22 = aievec.upd %arg0[%1, %17] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %23 = aievec.upd %arg0[%1, %17], %22 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %24 = aievec_aie1.mac %23, %0, %21 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "3", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %25 = aievec_aie1.mac %23, %0, %24 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "5", zstep = "10"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %26 = aievec.upd %arg0[%2, %17] {index = 0 : i8, offset = 0 : i32} : memref<?x64xi16>, vector<32xi16>
      %27 = aievec.upd %arg0[%2, %17], %26 {index = 1 : i8, offset = 256 : i32} : memref<?x64xi16>, vector<32xi16>
      %28 = aievec_aie1.mac %27, %0, %25 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %29 = aievec_aie1.mac %27, %0, %28 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "7"} : vector<32xi16>, vector<16xi16>, vector<16xi48>
      %30 = aievec.srs %29, %c0_i32 : vector<16xi48>, i32, vector<16xi16>
      vector.transfer_write %30, %arg2[%arg3, %17] : vector<16xi16>, memref<?x32xi16>
    }
    return
  }
}