binaryop的解读
binaryop是用来二元计算的op,看binaryop.h的内容:
// Element-wise binary operation layer: C = BinaryOp(A, B).
// B is either a second input blob (two-blob forward with broadcasting)
// or a scalar constant (single-blob inplace forward), selected by the
// with_scalar parameter loaded in load_param().
class BinaryOp : public Layer
{
public:
BinaryOp();
// Reads param 0 (op_type), param 1 (with_scalar) and param 2 (b);
// when with_scalar != 0 the layer switches itself to
// single-input + inplace mode (see load_param in binaryop.cpp).
virtual int load_param(const ParamDict& pd);
using Layer::forward;
using Layer::forward_inplace;
// Two-blob form: element-wise op between bottom_blobs[0] and bottom_blobs[1],
// following ncnn's broadcasting rules.
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
// Scalar form: applies op(x, b) to every element of bottom_top_blob in place.
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
// Supported element-wise operations.
// NOTE(review): RSUB/RDIV presumably apply the operands in reversed order
// (b - a, b / a) — implementation not shown here, confirm in binaryop.cpp.
enum OperationType
{
Operation_ADD = 0,
Operation_SUB = 1,
Operation_MUL = 2,
Operation_DIV = 3,
Operation_MAX = 4,
Operation_MIN = 5,
Operation_POW = 6,
Operation_RSUB = 7,
Operation_RDIV = 8
};
public:
// param
int op_type;     // which OperationType to apply (param id 0)
int with_scalar; // non-zero => second operand is the scalar b (param id 1)
float b;         // scalar operand value, used when with_scalar != 0 (param id 2)
};
枚举OperationType下面就是此op的所有计算类型了。
接着看binaryop.cpp的构造函数:
// Default configuration: two input blobs, one output blob, no inplace
// computation. load_param() flips both flags when a scalar operand is used.
BinaryOp::BinaryOp()
{
    support_inplace = false;
    one_blob_only = false;
}
构造函数里把此 op 初始化成多输入多输出(one_blob_only = false)且不支持就地运算(support_inplace = false)。
我们再来看参数装载函数:
// Loads the three layer parameters:
//   0: op_type     — operation to perform (see OperationType)
//   1: with_scalar — non-zero when the second operand is the constant b
//   2: b           — scalar operand value
int BinaryOp::load_param(const ParamDict& pd)
{
    op_type = pd.get(0, 0);
    with_scalar = pd.get(1, 0);
    b = pd.get(2, 0.f);

    // Scalar mode takes a single input blob and computes in place.
    if (with_scalar)
    {
        one_blob_only = true;
        support_inplace = true;
    }

    return 0;
}
此op共有三个参数:
- op_type:计算类型,也就是上面枚举中定义的那些。
- with_scalar:是否是标量。
- b:标量值。
前面头文件可以看到此op有两个推理的重载,一个是矩阵与矩阵的二元运算,一个是矩阵与标量的二元运算。由于运算类型很多,这里ncnn用了函数模板。我们先来看矩阵与标量的运算:
// Applies op(x, b) to every element of blob a, in place.
// Op is a functor type whose operator()(float, float) implements the
// element-wise binary operation; channels are processed in parallel
// via OpenMP.
template<typename Op>
static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt)
{
    Op op;

    const int channels = a.c;
    const int plane_size = a.w * a.h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ch = 0; ch < channels; ch++)
    {
        float* p = a.channel(ch);

        // combine every element of this channel with the scalar b
        for (int j = 0; j < plane_size; j++)
        {
            p[j] = op(p[j], b);
        }
    }

    return 0;
}
矩阵与矩阵的二元运算函数过长,这里面只贴出关键的计算。这里涉及到一个知识点:矩阵和矩阵之间运算需要注意 Broadcasting(广播)机制。这里推荐一篇别人写的简书文章,专门介绍 Broadcasting 机制,可供参考。
ncnn代码这里up主写了个注释:
broadcasting rule:
https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting
上面链接里内容如下:
broadcasting rule
ncnn BinaryOp accepts blobs with different shape
C = BinaryOp(A, B)
shape notation convention is [w], [w,h], [w,h,c]
type | A | B | C |
---|---|---|---|
1 | [1] | scalar | [1] |
2 | [1] | [2] | [2] |
3 | [1] | [2,3] | [2,3] |
4 | [1] | [2,3,4] | [2,3,4] |
5 | [2] | scalar | [2] |
6 | [2] | [1] | [2] |
7 | [2] | [2] | [2] |
8 | [3] | [2,3] | [2,3] |
9 | [4] | [2,3,4] | [2,3,4] |
10 | [2,3] | scalar | [2,3] |
11 | [2,3] | [1] | [2,3] |
12 | [2,3] | [3] | [2,3] |
13 | [2,3] | [2,3] | [2,3] |
14 | [3,4] | [2,3,4] | [2,3,4] |
15 | [2,3,4] | scalar | [2,3,4] |
16 | [2,3,4] | [1] | [2,3,4] |
17 | [2,3,4] | [4] | [2,3,4] |
18 | [2,3,4] | [3,4] | [2,3,4] |
19 | [2,3,4] | [2,3,4] | [2,3,4] |
some special broadcasting rule exists for model compatibility
special type | A | B | C |
---|---|---|---|
1 | [2,3,4] | [1,1,4] | [2,3,4] |
2 | [2,3,4] | [2,3,1] | [2,3,4] |
3 | [1,1,4] | [2,3,4] | [2,3,4] |
4 | [2,3,1] | [2,3,4] | [2,3,4] |
程序大体上结构是基于if...else...的,底层两个blob作为输入,顶层一个blob作为输出。先来看看a为三维矩阵,b也为三维矩阵时的计算(对于计算过程的分析全部写在注释里):
if (w1 == 1 && h1 == 1 && channels1 == channels)
{
// special type 1
//这里面注释是特殊类型1,从表中可以看出矩阵B的wh都是1
//创造top blob矩阵c,形状和a一致
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;
//openmp指令,用于多线程
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
//矩阵b实际上只有4个元素,分布在四个通道上,这里把b和a的形状补全成一样的,这样每个通道上各自的矩形内所有元素都相等。
const float* b0 = b.channel(q);
float* outptr = c.channel(q);
for (int i = 0; i < size; i++)
{
outptr[i] = op(ptr[i], b0[0]);
}
}
return 0;
}
if (w1 == w && h1 == h && channels1 == 1)
{
// special type 2
//这里面注释是特殊类型2,从表中可以看出矩阵B的通道数为1
//创造top blob矩阵c,形状和a一致
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;
//openmp指令,用于多线程
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
//这里b的通道数为1,意味着必须沿着通道轴(2号轴)进行扩展,使b的形状与a一致,相当于将该矩形面内的元素复制多份
const float* ptr1 = b;
float* outptr = c.channel(q);
for (int i = 0; i < size; i++)
{
outptr[i] = op(ptr[i], ptr1[i]);
}
}
return 0;
}
//下面这两种特殊情况与上面两种正好反过来,这里就不详细介绍了
if (w == 1 && h == 1 && channels1 == channels)
{
// special type 3
c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* a0 = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);
for (int i = 0; i < size1; i++)
{
outptr[i] = op(a0[0], ptr1[i]);
}
}
return 0;
}
if (w1 == w && h1 == h && channels == 1)
{
// special type 4
c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a;
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);
for (int i = 0; i < size1; i++)
{
outptr[i] = op(ptr[i], ptr1[i]);
}
}
return 0;
}
// type 19
// 这个是大表格里的类型19,形状相等的矩阵进行计算,这个很简单,就不详细说了,需要注意的是我们这里的乘除法不是矩阵乘除法,仅仅是两个矩阵对应元素的乘除。
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);
for (int i = 0; i < size; i++)
{
outptr[i] = op(ptr[i], ptr1[i]);
}
}
return 0;
}
写到这里,我想下面应该没有写下去的必要了,就是根据上面表格里的broadcasting rule去计算两个矩阵的运算。
pr内容:
binaryop
This operation is used for binary computation, and the calculation rule depends on the broadcasting rule. broadcasting rule
C = BinaryOp(A, B)
if with_scalar = 1:
- one_blob_only
- support_inplace
param id | name | type | default | description |
---|---|---|---|---|
0 | op_type | int | 0 | Operation type |
1 | with_scalar | int | 0 | 1=B is a scalar, 0=B is a matrix |
2 | b | float | 0.f | When B is a scalar, B = b |