note01
test_ops: walkthrough of the test flow
The complete test-flow code for a single operator:
def test_batch_norm():
    # Define the data placeholder (the carrier for the input)
    data = relay.var("data", shape=(2, 7, 4, 3))
    # Build the network with relay (construct the computation graph)
    net = batch_norm_infer(data)[0]
    # Analyze the free variables (input parameters) of the network
    args = relay.analysis.free_vars(net)
    # Combine net and args into a small subgraph
    subgraph = relay.Function(args, net)
    # Run type inference (why not possible earlier? because data has to be included first)
    subgraph = testing.run_infer_type(subgraph)
    # Transform the graph to include the backward pass
    reverse = transform.gradient(subgraph, mode="first_order")
    # Compile (lower) it into an executable module; vm is that executable module
    backward, vm = nn_compile(reverse)
    # Give the placeholders actual data
    data_np = np.random.uniform(0, 10, size=(2, 7, 4, 3)).astype("float32")
    label_np = np.random.uniform(0, 10, size=(2, 7, 4, 3)).astype("float32")
    # Run and collect the outputs
    outputs = nn_exec(backward, vm, (2, 7, 4, 3), data_np=data_np, label_np=label_np)
    # Print the outputs
    print(outputs[1])
    return outputs
Detailed walkthrough
Tracing where the params come from
I set a breakpoint with pdb and stepped through; the intermediate results give a rough picture of what is going on.
-> return backward, vm
(Pdb)
(Pdb)
(Pdb) print(backward)
def @main(%param_data: Tensor[(2, 7, 4, 3), float32], %param_bn_gamma: Tensor[(7), float32], %param_bn_beta: Tensor[(7), float32], %param_bn_moving_mean: Tensor[(7), float32], %param_bn_moving_var: Tensor[(7), float32]) {
%40 = fn (%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma: Tensor[(7), float32], %bn_beta: Tensor[(7), float32], %bn_moving_mean: Tensor[(7), float32], %bn_moving_var: Tensor[(7), float32], Primitive=1, Compiler="xpucompiler", global_symbol="xpu") {
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%1 = %0.0;
You can see that the printed backward module is a @main function wrapping the content to run (it wraps an fn).
Its parameters are param_data, param_bn_gamma, param_bn_beta, param_bn_moving_mean, and param_bn_moving_var.
After some digging into where these parameters come from:
param_bn_gamma, param_bn_beta, param_bn_moving_mean, and param_bn_moving_var are all added inside batch_norm_infer,
while the remaining param_data comes from data:
In short,
data is created by data = relay.var("data") (so there is a relay.var named "data"),
and net = batch_norm_infer(data)[0] makes net take that relay.var named "data" as an input parameter; that is where the name comes from.
batch_norm_infer also creates bn_gamma, bn_beta, bn_moving_mean, and bn_moving_var;
they too are relay.vars built inside batch_norm_infer and then given those names.
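batch_norm_infer is a helper in the test code; judging from the variable names in the IR printed below, it presumably looks roughly like this (a sketch, not the verbatim helper; the default name "bn" is my assumption, chosen to match the printed bn_* names):

from tvm import relay

def batch_norm_infer(data, name="bn", **kwargs):
    # create the affine/statistics parameters as free relay.vars, named after `name`
    gamma = relay.var(name + "_gamma")
    beta = relay.var(name + "_beta")
    moving_mean = relay.var(name + "_moving_mean")
    moving_var = relay.var(name + "_moving_var")
    # returns the TupleWrapper of batch_norm, which is why the test indexes [0]
    return relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var, **kwargs)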
那"param_"这个前缀哪来的呢?
是在之后的nn.compile(reverse)
nn.compile里面会有内容的重组,包括把参数都重新命名了一遍,就是取出了参数加上了这个param_前缀。
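Presumably the renaming inside nn_compile boils down to something along these lines (a sketch; the actual helper is not shown here, and using checked_type assumes types have already been inferred at that point):

free = relay.analysis.free_vars(reverse)
param_args = [relay.var("param_" + v.name_hint, v.checked_type) for v in free]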
What the model IR goes through along the way
# net = batch_norm_infer(data)[0]
# print(net)
free_var %data: Tensor[(2, 7, 4, 3), float32];
free_var %bn_gamma;
free_var %bn_beta;
free_var %bn_moving_mean;
free_var %bn_moving_var;
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var);
%0.0
# In Relay syntax, "xxx.0" means xxx is a tuple-like value and the expression takes its first element.
# Also, the expression written last is the return value.
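For reference, "%0.0" is simply how relay.TupleGetItem prints; a tiny standalone illustration (plain Relay API, not the test helper):

from tvm import relay

a = relay.var("a", shape=(2,))
b = relay.var("b", shape=(3,))
tup = relay.Tuple([a, b])
first = relay.TupleGetItem(tup, 0)
print(first)   # prints the free vars, then "%0 = (%a, %b);" and finally "%0.0"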
# args = relay.analysis.free_vars(net)
# print(args)
[Var(data, ty=TensorType([2, 7, 4, 3], float32)), Var(bn_gamma), Var(bn_beta), Var(bn_moving_mean), Var(bn_moving_var)]
# subgraph = relay.Function(args, net)
# print(subgraph)
fn (%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) {
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var);
%0.0
}
# subgraph = testing.run_infer_type(subgraph)
# print(subgraph)
fn (%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma: Tensor[(7), float32], %bn_beta: Tensor[(7), float32], %bn_moving_mean: Tensor[(7), float32], %bn_moving_var: Tensor[(7), float32]) -> Tensor[(2, 7, 4, 3), float32] {
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%0.0
}
# We can see the types have now been inferred correctly, derived from the known types plus the type relations between the parameters. LayerNormRel likewise derives the other parameters' shapes from the shape of data.
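The shape relation that BatchNormRel/LayerNormRel encode boils down to something like the following plain-Python check (a sketch, assuming the channel axis is 1 as in NCHW):

data_shape = (2, 7, 4, 3)
axis = 1
param_shape = (data_shape[axis],)
print(param_shape)   # (7,) -- the shape inferred for bn_gamma, bn_beta, etc.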
# reverse = transform.gradient(subgraph, mode="first_order")
# print(reverse)
fn (%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma: Tensor[(7), float32], %bn_beta: Tensor[(7), float32], %bn_moving_mean: Tensor[(7), float32], %bn_moving_var: Tensor[(7), float32]) -> (Tensor[(2, 7, 4, 3), float32], (Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32], Tensor[(7), float32], Tensor[(7), float32])) {
let %x = %data;
let %x1 = zeros_like(%x);
let %x2 = %bn_gamma;
let %x3 = zeros_like(%x2);
let %x4 = %bn_beta;
let %x5 = zeros_like(%x4);
let %x6 = %bn_moving_mean;
let %x7 = zeros_like(%x6);
let %x8 = %bn_moving_var;
let %x9 = zeros_like(%x8);
%0 = nn.batch_norm(%x, %x2, %x4, %x6, %x8) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
let %x10 = %0;
%1 = zeros(shape=[2, 7, 4, 3], dtype="float32");
%2 = zeros(shape=[7], dtype="float32");
%3 = zeros(shape=[7], dtype="float32");
let %x11 = (%1, %2, %3);
let %x12 = %x10.0;
let %x13 = zeros_like(%x12);
%4 = %x11.0;
%5 = ones_like(%x12);
%17 = (
let %x14 = add(%4, %5);
%6 = %x11.1;
%7 = %x11.2;
let %x15 = (%x14, %6, %7);
%8 = %x15.0;
%9 = %0.1;
%10 = %0.2;
%11 = nn.batch_norm_grad(%x, %8, %x2, %9, %10);
%12 = %11.0;
let %x16 = add(%x1, %12);
%13 = %11.1;
let %x17 = add(%x3, %13);
%14 = %11.2;
let %x18 = add(%x5, %14);
%15 = zeros_like(%x6);
let %x19 = add(%x7, %15);
%16 = zeros_like(%x8);
let %x20 = add(%x9, %16);
(%x16, %x17, %x18, %x19, %x20)
);
(%x12, %17)
}
# Note that the IR now contains calls to both batch_norm and batch_norm_grad.
# The content has grown: the IR in the original subgraph was just the forward pass,
# while the IR in reverse also includes the backward pass.
# So the "transform" in transform.gradient means rewriting,
# rewriting the IR so that the backward pass is attached.
# Where does the backward part come from?
# It corresponds to the gradient definitions in _tensor_grad.py.
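The registration presumably looks roughly like the sketch below, reconstructed from the IR above rather than copied from _tensor_grad.py; in particular, the Python binding for the custom nn.batch_norm_grad op, and that it returns a TupleWrapper like relay.nn.batch_norm does, are assumptions:

from tvm import relay
from tvm.relay.op import register_gradient

@register_gradient("nn.batch_norm")
def batch_norm_grad_rule(orig, grad):
    # orig is the original nn.batch_norm call, grad the adjoint of its tuple output
    data, gamma, beta, moving_mean, moving_var = orig.args
    g = relay.nn.batch_norm_grad(          # assumed Python binding of the custom op
        data,
        relay.TupleGetItem(grad, 0),       # gradient w.r.t. the normalized output
        gamma,
        relay.TupleGetItem(orig, 1),       # mean output of the forward call
        relay.TupleGetItem(orig, 2))       # var output of the forward call
    # data/gamma/beta gradients come from batch_norm_grad; the moving statistics
    # just receive zeros, exactly as the IR above shows
    return [g[0], g[1], g[2],
            relay.zeros_like(moving_mean), relay.zeros_like(moving_var)]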
# backward, vm = nn_compile(reverse)
# step into nn_compile:
# param_args = ....
# print(param_args)
[Var(param_data, ty=TensorType([2, 7, 4, 3], float32)), Var(param_bn_gamma, ty=TensorType([7], float32)), Var(param_bn_beta, ty=TensorType([7], float32)), Var(param_bn_moving_mean, ty=TensorType([7], float32)), Var(param_bn_moving_var, ty=TensorType([7], float32))]
# Indeed still the same parameters, but their names now carry the param_ prefix.
# reverseIR = tvm.IRModule.from_expr(reverse)
# print(reverseIR)
def @main(%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma: Tensor[(7), float32], %bn_beta: Tensor[(7), float32], %bn_moving_mean: Tensor[(7), float32], %bn_moving_var: Tensor[(7), float32]) -> (Tensor[(2, 7, 4, 3), float32], (Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32], Tensor[(7), float32], Tensor[(7), float32])) {
let %x = %data;
let %x1 = zeros_like(%x);
let %x2 = %bn_gamma;
let %x3 = zeros_like(%x2);
let %x4 = %bn_beta;
let %x5 = zeros_like(%x4);
let %x6 = %bn_moving_mean;
let %x7 = zeros_like(%x6);
let %x8 = %bn_moving_var;
let %x9 = zeros_like(%x8);
%0 = nn.batch_norm(%x, %x2, %x4, %x6, %x8) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
let %x10 = %0;
%1 = zeros(shape=[2, 7, 4, 3], dtype="float32");
%2 = zeros(shape=[7], dtype="float32");
%3 = zeros(shape=[7], dtype="float32");
let %x11 = (%1, %2, %3);
let %x12 = %x10.0;
let %x13 = zeros_like(%x12);
%4 = %x11.0;
%5 = ones_like(%x12);
%17 = (
let %x14 = add(%4, %5);
%6 = %x11.1;
%7 = %x11.2;
let %x15 = (%x14, %6, %7);
%8 = %x15.0;
%9 = %0.1;
%10 = %0.2;
%11 = nn.batch_norm_grad(%x, %8, %x2, %9, %10);
%12 = %11.0;
let %x16 = add(%x1, %12);
%13 = %11.1;
let %x17 = add(%x3, %13);
%14 = %11.2;
let %x18 = add(%x5, %14);
%15 = zeros_like(%x6);
let %x19 = add(%x7, %15);
%16 = zeros_like(%x8);
let %x20 = add(%x9, %16);
(%x16, %x17, %x18, %x19, %x20)
);
(%x12, %17)
}
# The fn in reverse has become def @main in reverseIR.
# That appears to be what from_expr does.
# The official docs describe from_expr as: "Construct a module from a standalone expression."
# Presumably wrapping the expression as @main is what makes it a module, i.e. something that can be invoked.
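A minimal standalone illustration of from_expr (standard TVM API, unrelated to the batch_norm graph):

import tvm
from tvm import relay

x = relay.var("x", shape=(2,))
f = relay.Function([x], x + relay.const(1.0))
mod = tvm.IRModule.from_expr(f)
print(mod["main"])   # the same function, now bound to the @main GlobalVar of a module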
# reverseIR = relay.transform.ToGraphNormalForm()(reverseIR)
# print(reverseIR)
def @main(%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma: Tensor[(7), float32], %bn_beta: Tensor[(7), float32], %bn_moving_mean: Tensor[(7), float32], %bn_moving_var: Tensor[(7), float32]) -> (Tensor[(2, 7, 4, 3), float32], (Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32], Tensor[(7), float32], Tensor[(7), float32])) {
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%1 = %0.0;
%2 = zeros_like(%data) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%3 = zeros(shape=[2, 7, 4, 3], dtype="float32") /* ty=Tensor[(2, 7, 4, 3), float32] */;
%4 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%5 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%6 = (%3, %4, %5);
%7 = %6.0;
%8 = ones_like(%1) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%9 = add(%7, %8) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%10 = %6.1;
%11 = %6.2;
%12 = (%9, %10, %11);
%13 = %12.0;
%14 = %0.1;
%15 = %0.2;
%16 = nn.batch_norm_grad(%data, %13, %bn_gamma, %14, %15) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%17 = %16.0;
%18 = add(%2, %17) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%19 = zeros_like(%bn_gamma) /* ty=Tensor[(7), float32] */;
%20 = %16.1;
%21 = add(%19, %20) /* ty=Tensor[(7), float32] */;
%22 = zeros_like(%bn_beta) /* ty=Tensor[(7), float32] */;
%23 = %16.2;
%24 = add(%22, %23) /* ty=Tensor[(7), float32] */;
%25 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%26 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%27 = add(%25, %26) /* ty=Tensor[(7), float32] */;
%28 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%29 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%30 = add(%28, %29) /* ty=Tensor[(7), float32] */;
%31 = (%18, %21, %24, %27, %30);
(%1, %31)
}
# Compared with the earlier reverse, the lets are gone, and it does look much more like a graph (hence the name ToGraphNormalForm).
# reverse = reverseIR["main"].body
# print(reverse)
free_var %data: Tensor[(2, 7, 4, 3), float32];
free_var %bn_gamma: Tensor[(7), float32];
free_var %bn_beta: Tensor[(7), float32];
free_var %bn_moving_mean: Tensor[(7), float32];
free_var %bn_moving_var: Tensor[(7), float32];
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%1 = %0.0;
%2 = zeros_like(%data) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%3 = zeros(shape=[2, 7, 4, 3], dtype="float32") /* ty=Tensor[(2, 7, 4, 3), float32] */;
%4 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%5 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%6 = (%3, %4, %5);
%7 = %6.0;
%8 = ones_like(%1) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%9 = add(%7, %8) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%10 = %6.1;
%11 = %6.2;
%12 = (%9, %10, %11);
%13 = %12.0;
%14 = %0.1;
%15 = %0.2;
%16 = nn.batch_norm_grad(%data, %13, %bn_gamma, %14, %15) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%17 = %16.0;
%18 = add(%2, %17) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%19 = zeros_like(%bn_gamma) /* ty=Tensor[(7), float32] */;
%20 = %16.1;
%21 = add(%19, %20) /* ty=Tensor[(7), float32] */;
%22 = zeros_like(%bn_beta) /* ty=Tensor[(7), float32] */;
%23 = %16.2;
%24 = add(%22, %23) /* ty=Tensor[(7), float32] */;
%25 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%26 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%27 = add(%25, %26) /* ty=Tensor[(7), float32] */;
%28 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%29 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%30 = add(%28, %29) /* ty=Tensor[(7), float32] */;
%31 = (%18, %21, %24, %27, %30);
(%1, %31)
# This is just the IR of the main part of reverseIR, and it carries free_vars again.
# What else carried free_vars earlier? Right, net did.
# So reverse is now a "graph IR" just like net was before, except this graph IR also contains the gradient computation.
# loss = relay.TupleGetItem(reverse, 0)
# print(loss)
free_var %data: Tensor[(2, 7, 4, 3), float32];
free_var %bn_gamma: Tensor[(7), float32];
free_var %bn_beta: Tensor[(7), float32];
free_var %bn_moving_mean: Tensor[(7), float32];
free_var %bn_moving_var: Tensor[(7), float32];
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%1 = %0.0;
%2 = zeros_like(%data) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%3 = zeros(shape=[2, 7, 4, 3], dtype="float32") /* ty=Tensor[(2, 7, 4, 3), float32] */;
%4 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%5 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%6 = (%3, %4, %5);
%7 = %6.0;
%8 = ones_like(%1) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%9 = add(%7, %8) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%10 = %6.1;
%11 = %6.2;
%12 = (%9, %10, %11);
%13 = %12.0;
%14 = %0.1;
%15 = %0.2;
%16 = nn.batch_norm_grad(%data, %13, %bn_gamma, %14, %15) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%17 = %16.0;
%18 = add(%2, %17) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%19 = zeros_like(%bn_gamma) /* ty=Tensor[(7), float32] */;
%20 = %16.1;
%21 = add(%19, %20) /* ty=Tensor[(7), float32] */;
%22 = zeros_like(%bn_beta) /* ty=Tensor[(7), float32] */;
%23 = %16.2;
%24 = add(%22, %23) /* ty=Tensor[(7), float32] */;
%25 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%26 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%27 = add(%25, %26) /* ty=Tensor[(7), float32] */;
%28 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%29 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%30 = add(%28, %29) /* ty=Tensor[(7), float32] */;
%31 = (%18, %21, %24, %27, %30);
%32 = (%1, %31);
%32.0
# This is effectively another small transform (rewrite).
# Comparing reverse and loss shows that
# the IR of loss is simply the IR of reverse plus the tuple-access IR (!).
# grad = relay.TupleGetItem(reverse, 1)
# print(grad)
free_var %data: Tensor[(2, 7, 4, 3), float32];
free_var %bn_gamma: Tensor[(7), float32];
free_var %bn_beta: Tensor[(7), float32];
free_var %bn_moving_mean: Tensor[(7), float32];
free_var %bn_moving_var: Tensor[(7), float32];
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%1 = %0.0;
%2 = zeros_like(%data) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%3 = zeros(shape=[2, 7, 4, 3], dtype="float32") /* ty=Tensor[(2, 7, 4, 3), float32] */;
%4 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%5 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%6 = (%3, %4, %5);
%7 = %6.0;
%8 = ones_like(%1) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%9 = add(%7, %8) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%10 = %6.1;
%11 = %6.2;
%12 = (%9, %10, %11);
%13 = %12.0;
%14 = %0.1;
%15 = %0.2;
%16 = nn.batch_norm_grad(%data, %13, %bn_gamma, %14, %15) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%17 = %16.0;
%18 = add(%2, %17) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%19 = zeros_like(%bn_gamma) /* ty=Tensor[(7), float32] */;
%20 = %16.1;
%21 = add(%19, %20) /* ty=Tensor[(7), float32] */;
%22 = zeros_like(%bn_beta) /* ty=Tensor[(7), float32] */;
%23 = %16.2;
%24 = add(%22, %23) /* ty=Tensor[(7), float32] */;
%25 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%26 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%27 = add(%25, %26) /* ty=Tensor[(7), float32] */;
%28 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%29 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%30 = add(%28, %29) /* ty=Tensor[(7), float32] */;
%31 = (%18, %21, %24, %27, %30);
%32 = (%1, %31);
%32.1
# Same idea as loss.
# compute = []
# for idx in range(0, len(param_args)):
# compute.append(relay.TupleGetItem(grad, idx))
# compute.insert(0, loss)
# Now it becomes clear what is going on.
# Explanation:
# First, reverse is the IR obtained from transform.gradient(subgraph).
# What kind of function does transform.gradient build (i.e., what does the IR in reverse actually do)?
# It returns a Tuple whose first element is the loss and whose second element is the grad.
# The transform.gradient documentation says exactly this:
# Transform the input function, returning a function that calculate the original result, paired with gradient of the input.
# In other words, the IR inside transform.gradient(subgraph) (i.e., the IR in reverse) describes: compute the original function's result and the gradient with respect to the given inputs, and return the pair (original result, gradient).
#
# (So judging from the docs, loss is not a very apt variable name; ori_result would fit better.)
# Therefore loss = relay.TupleGetItem(reverse, 0) means: since the IR of reverse describes returning an (original result, gradient) pair, appending IR that takes item 0 of that tuple yields IR that describes how to compute original_result.
# Likewise, grad = relay.TupleGetItem(reverse, 1) appends tuple-access IR to reverse in order to obtain IR that describes how to compute grad.
# compute = []
# for idx in range(0, len(param_args)):
# compute.append(relay.TupleGetItem(grad, idx))
# compute.insert(0, loss)
# The same reasoning applies to this snippet:
# grad is itself a tuple holding the gradient with respect to each parameter, so again a piece of tuple-access IR is appended to grad in order to obtain, for each parameter, the IR that computes its gradient.
# Finally everything is flattened out and stitched together.
# These pieces of IR are then wrapped into a function via relay.Function(args, reverse).
# The IR of that function describes returning a tuple of the computed values (orig_result, grad_of_param1, grad_of_param2, ..., grad_of_param5).
# Looking further at the code that follows,
# the relay.Function is then wrapped into relay.Call(reverse, param_args),
# which is wrapped again into backward = relay.Function(param_args, call).
# These steps are shown below as well.
# reverse = relay.Tuple(compute)
# After that long stretch we know that reverse is still a graph IR
# whose return value is a tuple; each element is a computed value (how it is computed is also described in reverse's IR):
# original_result, grad_of_param1, ..., grad_of_param5
# args = relay.analysis.free_vars(reverse)
# reverse = relay.Function(args, reverse)
# reverse = set_external_func_attr(reverse, "xpucompiler", "xpu")  # as seen below, this indeed attaches the three function attributes Primitive, Compiler, and global_symbol
# Up to this point we have gone from the graph IR net to a Function IR, then (after from_expr and ToGraphNormalForm) taken the body, turning it back into a graph IR,
# and after all the reassembly we are back to a Function IR.
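set_external_func_attr is a local helper; presumably it is just the standard with_attr calls, something like this sketch (not the verbatim helper):

import tvm

def set_external_func_attr(func, compiler, symbol):
    func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
    func = func.with_attr("Compiler", compiler)
    func = func.with_attr("global_symbol", symbol)
    return func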
# print(reverse)
fn (%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma: Tensor[(7), float32], %bn_beta: Tensor[(7), float32], %bn_moving_mean: Tensor[(7), float32], %bn_moving_var: Tensor[(7), float32], Primitive=1, Compiler="xpucompiler", global_symbol="xpu") {
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%1 = %0.0;
%2 = zeros_like(%data) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%3 = zeros(shape=[2, 7, 4, 3], dtype="float32") /* ty=Tensor[(2, 7, 4, 3), float32] */;
%4 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%5 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%6 = (%3, %4, %5);
%7 = %6.0;
%8 = ones_like(%1) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%9 = add(%7, %8) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%10 = %6.1;
%11 = %6.2;
%12 = (%9, %10, %11);
%13 = %12.0;
%14 = %0.1;
%15 = %0.2;
%16 = nn.batch_norm_grad(%data, %13, %bn_gamma, %14, %15) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%17 = %16.0;
%18 = add(%2, %17) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%19 = zeros_like(%bn_gamma) /* ty=Tensor[(7), float32] */;
%20 = %16.1;
%21 = add(%19, %20) /* ty=Tensor[(7), float32] */;
%22 = zeros_like(%bn_beta) /* ty=Tensor[(7), float32] */;
%23 = %16.2;
%24 = add(%22, %23) /* ty=Tensor[(7), float32] */;
%25 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%26 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%27 = add(%25, %26) /* ty=Tensor[(7), float32] */;
%28 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%29 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%30 = add(%28, %29) /* ty=Tensor[(7), float32] */;
%31 = (%18, %21, %24, %27, %30);
%32 = (%1, %31);
%33 = %32.0;
%34 = %32.1;
%35 = %34.0;
%36 = %34.1;
%37 = %34.2;
%38 = %34.3;
%39 = %34.4;
(%33, %35, %36, %37, %38, %39)
}
# call = relay.Call(reverse, param_args)
# print(call)
free_var %param_data: Tensor[(2, 7, 4, 3), float32];
free_var %param_bn_gamma: Tensor[(7), float32];
free_var %param_bn_beta: Tensor[(7), float32];
free_var %param_bn_moving_mean: Tensor[(7), float32];
free_var %param_bn_moving_var: Tensor[(7), float32];
%40 = fn (%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma: Tensor[(7), float32], %bn_beta: Tensor[(7), float32], %bn_moving_mean: Tensor[(7), float32], %bn_moving_var: Tensor[(7), float32], Primitive=1, Compiler="xpucompiler", global_symbol="xpu") {
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%1 = %0.0;
%2 = zeros_like(%data) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%3 = zeros(shape=[2, 7, 4, 3], dtype="float32") /* ty=Tensor[(2, 7, 4, 3), float32] */;
%4 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%5 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%6 = (%3, %4, %5);
%7 = %6.0;
%8 = ones_like(%1) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%9 = add(%7, %8) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%10 = %6.1;
%11 = %6.2;
%12 = (%9, %10, %11);
%13 = %12.0;
%14 = %0.1;
%15 = %0.2;
%16 = nn.batch_norm_grad(%data, %13, %bn_gamma, %14, %15) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%17 = %16.0;
%18 = add(%2, %17) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%19 = zeros_like(%bn_gamma) /* ty=Tensor[(7), float32] */;
%20 = %16.1;
%21 = add(%19, %20) /* ty=Tensor[(7), float32] */;
%22 = zeros_like(%bn_beta) /* ty=Tensor[(7), float32] */;
%23 = %16.2;
%24 = add(%22, %23) /* ty=Tensor[(7), float32] */;
%25 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%26 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%27 = add(%25, %26) /* ty=Tensor[(7), float32] */;
%28 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%29 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%30 = add(%28, %29) /* ty=Tensor[(7), float32] */;
%31 = (%18, %21, %24, %27, %30);
%32 = (%1, %31);
%33 = %32.0;
%34 = %32.1;
%35 = %34.0;
%36 = %34.1;
%37 = %34.2;
%38 = %34.3;
%39 = %34.4;
(%33, %35, %36, %37, %38, %39)
};
%40(%param_data, %param_bn_gamma, %param_bn_beta, %param_bn_moving_mean, %param_bn_moving_var)
# Compared with reverse, content has been added at both ends: in front, the param_-prefixed free_vars; at the end, a call to the reverse that was assembled into a function above. The whole thing is now a "loose" graph IR again, so the subsequent backward = relay.Function(param_args, call) assembles it back into a Function.
# backward = relay.Function(param_args, call)
# print(backward)
fn (%param_data: Tensor[(2, 7, 4, 3), float32], %param_bn_gamma: Tensor[(7), float32], %param_bn_beta: Tensor[(7), float32], %param_bn_moving_mean: Tensor[(7), float32], %param_bn_moving_var: Tensor[(7), float32]) {
%40 = fn (%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma: Tensor[(7), float32], %bn_beta: Tensor[(7), float32], %bn_moving_mean: Tensor[(7), float32], %bn_moving_var: Tensor[(7), float32], Primitive=1, Compiler="xpucompiler", global_symbol="xpu") {
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%1 = %0.0;
%2 = zeros_like(%data) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%3 = zeros(shape=[2, 7, 4, 3], dtype="float32") /* ty=Tensor[(2, 7, 4, 3), float32] */;
%4 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%5 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%6 = (%3, %4, %5);
%7 = %6.0;
%8 = ones_like(%1) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%9 = add(%7, %8) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%10 = %6.1;
%11 = %6.2;
%12 = (%9, %10, %11);
%13 = %12.0;
%14 = %0.1;
%15 = %0.2;
%16 = nn.batch_norm_grad(%data, %13, %bn_gamma, %14, %15) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%17 = %16.0;
%18 = add(%2, %17) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%19 = zeros_like(%bn_gamma) /* ty=Tensor[(7), float32] */;
%20 = %16.1;
%21 = add(%19, %20) /* ty=Tensor[(7), float32] */;
%22 = zeros_like(%bn_beta) /* ty=Tensor[(7), float32] */;
%23 = %16.2;
%24 = add(%22, %23) /* ty=Tensor[(7), float32] */;
%25 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%26 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%27 = add(%25, %26) /* ty=Tensor[(7), float32] */;
%28 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%29 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%30 = add(%28, %29) /* ty=Tensor[(7), float32] */;
%31 = (%18, %21, %24, %27, %30);
%32 = (%1, %31);
%33 = %32.0;
%34 = %32.1;
%35 = %34.0;
%36 = %34.1;
%37 = %34.2;
%38 = %34.3;
%39 = %34.4;
(%33, %35, %36, %37, %38, %39)
};
%40(%param_data, %param_bn_gamma, %param_bn_beta, %param_bn_moving_mean, %param_bn_moving_var)
}
# backward = tvm.IRModule.from_expr(backward)
# This converts the Function assembled in the previous step into a module, i.e. def @main.
# print(backward)
def @main(%param_data: Tensor[(2, 7, 4, 3), float32], %param_bn_gamma: Tensor[(7), float32], %param_bn_beta: Tensor[(7), float32], %param_bn_moving_mean: Tensor[(7), float32], %param_bn_moving_var: Tensor[(7), float32]) {
%40 = fn (%data: Tensor[(2, 7, 4, 3), float32], %bn_gamma: Tensor[(7), float32], %bn_beta: Tensor[(7), float32], %bn_moving_mean: Tensor[(7), float32], %bn_moving_var: Tensor[(7), float32], Primitive=1, Compiler="xpucompiler", global_symbol="xpu") {
%0 = nn.batch_norm(%data, %bn_gamma, %bn_beta, %bn_moving_mean, %bn_moving_var) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%1 = %0.0;
%2 = zeros_like(%data) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%3 = zeros(shape=[2, 7, 4, 3], dtype="float32") /* ty=Tensor[(2, 7, 4, 3), float32] */;
%4 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%5 = zeros(shape=[7], dtype="float32") /* ty=Tensor[(7), float32] */;
%6 = (%3, %4, %5);
%7 = %6.0;
%8 = ones_like(%1) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%9 = add(%7, %8) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%10 = %6.1;
%11 = %6.2;
%12 = (%9, %10, %11);
%13 = %12.0;
%14 = %0.1;
%15 = %0.2;
%16 = nn.batch_norm_grad(%data, %13, %bn_gamma, %14, %15) /* ty=(Tensor[(2, 7, 4, 3), float32], Tensor[(7), float32], Tensor[(7), float32]) */;
%17 = %16.0;
%18 = add(%2, %17) /* ty=Tensor[(2, 7, 4, 3), float32] */;
%19 = zeros_like(%bn_gamma) /* ty=Tensor[(7), float32] */;
%20 = %16.1;
%21 = add(%19, %20) /* ty=Tensor[(7), float32] */;
%22 = zeros_like(%bn_beta) /* ty=Tensor[(7), float32] */;
%23 = %16.2;
%24 = add(%22, %23) /* ty=Tensor[(7), float32] */;
%25 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%26 = zeros_like(%bn_moving_mean) /* ty=Tensor[(7), float32] */;
%27 = add(%25, %26) /* ty=Tensor[(7), float32] */;
%28 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%29 = zeros_like(%bn_moving_var) /* ty=Tensor[(7), float32] */;
%30 = add(%28, %29) /* ty=Tensor[(7), float32] */;
%31 = (%18, %21, %24, %27, %30);
%32 = (%1, %31);
%33 = %32.0;
%34 = %32.1;
%35 = %34.0;
%36 = %34.1;
%37 = %34.2;
%38 = %34.3;
%39 = %34.4;
(%33, %35, %36, %37, %38, %39)
};
%40(%param_data, %param_bn_gamma, %param_bn_beta, %param_bn_moving_mean, %param_bn_moving_var)
}
# target = "llvm"
# ctx = tvm.cpu()
# with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout", "SimplifyInference"]):
# exe = relay.vm.compile(backward, target=target)
# code, lib = exe.save()
# lib = update_lib(lib)
# exe = runtime.vm.Executable.load_exec(code, lib)
# vm = runtime.vm.VirtualMachine(exe, ctx)
# print(backward)
# pdb.set_trace()
# The code above (the rest of nn_compile) compiles the module we just built (backward) into exe, the executable,
# and vm is the "virtual machine" loaded with that executable.
# After that, running only requires feeding actual data to the vm as arguments.
# See outputs = vm.run(**mod_params): the dictionary is unpacked so that the parameters carrying real data are passed in one by one.
Another observation: subgraph = relay.Function(args, net) converts the graph IR into a Function IR,
and only then can subgraph = testing.run_infer_type(subgraph) analyze the parameter shapes and pin down the parameter types;
conversely, reverseIR = tvm.IRModule.from_expr(reverse) followed by reverseIR = relay.transform.ToGraphNormalForm()(reverseIR) breaks the Function IR back down into a graph IR. from_expr is used because converting to a def @main function makes it convenient for reverse = reverseIR["main"].body to name "main" and grab the function body's IR, which in turn makes it easy to append the TupleGetItem IR and perform the other IR rewrites afterwards; the sketch below condenses these steps.
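Condensed, the round trip described above (restating the nn_compile steps traced earlier, nothing new):

mod = tvm.IRModule.from_expr(reverse)              # fn  ->  def @main
mod = relay.transform.ToGraphNormalForm()(mod)     # drop the lets
body = mod["main"].body                            # back to a "graph IR" expression
loss = relay.TupleGetItem(body, 0)                 # IR for the original result
grad = relay.TupleGetItem(body, 1)                 # IR for the gradient tuple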
There is not much left inside nn_exec: essentially one line, outputs = vm.run(**mod_params).
Everything else just populates the parameters in the mod_params dictionary.
Note that nn_exec also puts a "param_label" entry into mod_params, even though in many cases no such parameter exists, as printing backward shows.
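For this graph, what nn_exec sets up roughly amounts to the following (a sketch; the key names come from the printed @main signature, while the initial values chosen for the bn parameters are my assumptions, not necessarily what the helper uses):

import numpy as np
import tvm

# data_np and vm come from the test / nn_compile above
mod_params = {
    "param_data": tvm.nd.array(data_np),
    "param_bn_gamma": tvm.nd.array(np.ones(7, dtype="float32")),
    "param_bn_beta": tvm.nd.array(np.zeros(7, dtype="float32")),
    "param_bn_moving_mean": tvm.nd.array(np.zeros(7, dtype="float32")),
    "param_bn_moving_var": tvm.nd.array(np.ones(7, dtype="float32")),
}
outputs = vm.run(**mod_params)   # unpack the dict so each named param is passed in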
One more observation: in transform.gradient the backward pass is seeded with ones_like, i.e. the root gradient is all ones, so the pytorch reference used for comparison has to do the same.
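On the pytorch side that means seeding backward() with an all-ones tensor, roughly like this sketch (my own comparison snippet, assuming functional batch_norm in training mode as the reference; not the project's test code):

import torch

x = torch.rand(2, 7, 4, 3, requires_grad=True)
gamma = torch.ones(7, requires_grad=True)
beta = torch.zeros(7, requires_grad=True)
out = torch.nn.functional.batch_norm(
    x, torch.zeros(7), torch.ones(7), gamma, beta, training=True)
out.backward(torch.ones_like(out))   # root gradient of all ones, matching ones_like
print(x.grad.shape, gamma.grad.shape, beta.grad.shape)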
Some changes made while debugging
Because I am writing layer_norm_grad, I need two extra results from layer_norm, namely mean and var.
The original layer_norm has only one result, the normalized data; now layer_norm has to return
three values: normalized_data, mean, var.
The required source changes are: first, on the C++ side, change the return type during type inference, i.e. in LayerNormRel;
second, when relay.nn.layer_norm wraps _make.layer_norm (the Python wrapper around the compiled C++ function), add
result = expr.TupleWrapper(result, 3) to wrap the result, just like the wrapping relay.nn.batch_norm does (see the sketch after this list);
third, in pytorch.py, where graph operators are converted, _op.nn.layer_norm(xxx) should become _op.nn.layer_norm(xxx)[0],
because layer_norm used to produce only one result but is now wrapped as three, so the [0] index is needed to select the original one.
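For change two, the modified wrapper presumably ends up looking like this sketch (mirroring the upstream relay.nn.layer_norm signature and relay.nn.batch_norm's wrapping; the exact signature in this fork may differ):

from tvm.relay import expr
from tvm.relay.op.nn import _make

def layer_norm(data, gamma, beta, axis=-1, epsilon=1e-5, center=True, scale=True):
    result = _make.layer_norm(data, gamma, beta, axis, epsilon, center, scale)
    # used to return a single tensor; now exposes (normalized_data, mean, var)
    return expr.TupleWrapper(result, 3)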
There was also a problem with the dropout a senior labmate wrote: his dropout added an extra parameter, which apparently conflicted with other parts when importing bert,
so it was reverted to the original dropout implementation in tvm.
In test_batch_norm, one change was the shape (all 4 dimensions were made different, so that the inferred shapes are easier to check when printing the IR),
and a pdb breakpoint was added, which is how the line-by-line notes above on the IR changes were obtained.
bert source code notes