BERT Technical Deep Dive: Embedding
pytorch_pretrained_bert/modeling.py
BertEmbeddings analysis
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        # Word embedding table
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        # Position embedding table
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # Token type (segment) embedding table: when the input contains two sentences it distinguishes them;
        # positions belonging to the first sentence are all 0, positions belonging to the second are all 1
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        # LayerNorm layer
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        # Dropout probability
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # Forward pass. input_ids has shape [batch_size, seq_len], where each position holds the index of that
    # token in the vocabulary. seq_len here is max_seq_len; e.g. if the configured maximum sentence length
    # is 300, input_ids has shape [batch_size, 300].
    def forward(self, input_ids, token_type_ids=None):
        # Sequence length (300 in the example)
        seq_length = input_ids.size(1)
        # Build position_ids from the sequence length: shape [seq_len], values [0, 1, 2, ..., 299]
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        # Expand position_ids to the same 2-D shape as input_ids, i.e. [seq_len] -> [batch_size, seq_len],
        # giving batch_size copies of [0, 1, 2, ..., 299]
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        # If token_type_ids is not provided, default to all zeros with the same shape as input_ids,
        # i.e. [batch_size, seq_len]
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        # Embed input_ids
        words_embeddings = self.word_embeddings(input_ids)
        # Embed position_ids
        position_embeddings = self.position_embeddings(position_ids)
        # Embed token_type_ids
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        # Sum the three embeddings
        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        # Apply LayerNorm
        embeddings = self.LayerNorm(embeddings)
        # Apply dropout
        embeddings = self.dropout(embeddings)
        return embeddings
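To make the shapes concrete, here is a minimal sketch of driving BertEmbeddings with a toy configuration. The SimpleNamespace config and the small sizes are illustrative assumptions, not real BERT values, and it assumes the BertEmbeddings and BertLayerNorm classes shown in this post are in scope:

import torch
from types import SimpleNamespace

# Hypothetical toy config just for shape checking; real BERT-base uses vocab_size=30522, hidden_size=768, etc.
config = SimpleNamespace(vocab_size=100, hidden_size=16, max_position_embeddings=32,
                         type_vocab_size=2, hidden_dropout_prob=0.1)

embeddings = BertEmbeddings(config)
input_ids = torch.randint(0, config.vocab_size, (4, 10))   # [batch_size=4, seq_len=10]
token_type_ids = torch.zeros_like(input_ids)               # single-sentence input: all zeros
out = embeddings(input_ids, token_type_ids)
print(out.shape)                                           # torch.Size([4, 10, 16]) = [batch_size, seq_len, hidden_size]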
nn.Embedding (torch/nn/modules/sparse.py)
The heavy lifting above is done by nn.Embedding. The examples in its docstring are excellent and make its behavior clear at a glance. Let's look at the code of this module:
class Embedding(Module):
r"""A simple lookup table that stores embeddings of a fixed dictionary and size. This module is often used to store word embeddings and retrieve them using indices. The input to the module is a list of indices, and the output is the corresponding word embeddings.
Args:
num_embeddings (int): size of the dictionary of embeddings
embedding_dim (int): the size of each embedding vector
padding_idx (int, optional): If given, pads the output with the embedding vector at :attr:`padding_idx` (initialized to zeros) whenever it encounters the index. max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` is renormalized to have norm :attr:`max_norm`. norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. scale_grad_by_freq (boolean, optional): If given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default ``False``. sparse (bool, optional): If ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Attributes: weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim) initialized from :math:`\mathcal{N}(0, 1)` Shape: - Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}` .. note:: Keep in mind that only a limited number of optimizers support sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`), :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`) .. note:: With :attr:`padding_idx` set, the embedding vector at :attr:`padding_idx` is initialized to all zeros. However, note that this vector can be modified afterwards, e.g., using a customized initialization method, and thus changing the vector used to pad the output. The gradient for this vector from :class:`~torch.nn.Embedding` is always zero.
Examples::
>>> # an Embedding module containing 10 tensors of size 3
>>> embedding = nn.Embedding(10, 3)
>>> # a batch of 2 samples of 4 indices each
>>> input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
>>> embedding(input)
        tensor([[[-0.0251, -1.6902,  0.7172],
                 [-0.6431,  0.0748,  0.6969],
                 [ 1.4970,  1.3448, -0.9685],
                 [-0.3677, -2.7265, -0.1685]],

                [[ 1.4970,  1.3448, -0.9685],
                 [ 0.4362, -0.4004,  0.9400],
                 [-0.6431,  0.0748,  0.6969],
                 [ 0.9124, -2.3616,  1.1151]]])
>>> # example with padding_idx
>>> embedding = nn.Embedding(10, 3, padding_idx=0)
>>> input = torch.LongTensor([[0,2,0,5]])
>>> embedding(input)
        tensor([[[ 0.0000,  0.0000,  0.0000],
                 [ 0.1535, -2.0309,  0.9315],
                 [ 0.0000,  0.0000,  0.0000],
                 [-0.1655,  0.9897,  0.0635]]])
    """
__constants__ = ['num_embeddings', 'embedding_dim', 'max_norm', 'norm_type',
'scale_grad_by_freq', 'mode', 'sparse', 'include_last_offset']
num_embeddings: int
embedding_dim: int
padding_idx: int
max_norm: float
norm_type: float
scale_grad_by_freq: bool
weight: Tensor
sparse: bool
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None,
max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
sparse: bool = False, _weight: Optional[Tensor] = None) -> None:
super(Embedding, self).__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert padding_idx < self.num_embeddings, 'Padding_idx must be within num_embeddings'
elif padding_idx < 0:
assert padding_idx >= -self.num_embeddings, 'Padding_idx must be within num_embeddings'
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.max_norm = max_norm
self.norm_type = norm_type
self.scale_grad_by_freq = scale_grad_by_freq
if _weight is None:
            # weight has shape [num_embeddings, embedding_dim]
self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim))
self.reset_parameters()
else:
assert list(_weight.shape) == [num_embeddings, embedding_dim], \
'Shape of weight does not match num_embeddings and embedding_dim'
self.weight = Parameter(_weight)
self.sparse = sparse
def reset_parameters(self) -> None:
init.normal_(self.weight)
if self.padding_idx is not None:
with torch.no_grad():
self.weight[self.padding_idx].fill_(0)
def forward(self, input: Tensor) -> Tensor:
        # The actual lookup happens here.
        # input has shape [batch_size, max_seq_len]; the output has shape [batch_size, max_seq_len, embedding_dim].
        # Each position of input holds a single index; in the output that position is embedded into a vector of
        # size embedding_dim. In BERT, embedding_dim equals config.hidden_size.
        # self.weight has shape [num_embeddings, embedding_dim].
        #
        # For the word embedding, each index lies in [0, vocab_size) and num_embeddings = vocab_size.
        # For the position embedding, each index lies in [0, config.max_position_embeddings) and
        # num_embeddings = config.max_position_embeddings.
        # For the token type embedding, each index lies in [0, config.type_vocab_size) and
        # num_embeddings = config.type_vocab_size; config.type_vocab_size is usually 2, meaning the input
        # contains at most two sentences.
return F.embedding(
input, self.weight, self.padding_idx, self.max_norm,
self.norm_type, self.scale_grad_by_freq, self.sparse)
def extra_repr(self) -> str:
s = '{num_embeddings}, {embedding_dim}'
if self.padding_idx is not None:
s += ', padding_idx={padding_idx}'
if self.max_norm is not None:
s += ', max_norm={max_norm}'
if self.norm_type != 2:
s += ', norm_type={norm_type}'
if self.scale_grad_by_freq is not False:
s += ', scale_grad_by_freq={scale_grad_by_freq}'
if self.sparse is not False:
s += ', sparse=True'
return s.format(**self.__dict__)
@classmethod
def from_pretrained(cls, embeddings, freeze=True, padding_idx=None,
max_norm=None, norm_type=2., scale_grad_by_freq=False,
sparse=False):
r"""Creates Embedding instance from given 2-dimensional FloatTensor.
Args:
embeddings (Tensor): FloatTensor containing weights for the Embedding.
First dimension is being passed to Embedding as ``num_embeddings``, second as ``embedding_dim``.
freeze (boolean, optional): If ``True``, the tensor does not get updated in the learning process.
Equivalent to ``embedding.weight.requires_grad = False``. Default: ``True``
padding_idx (int, optional): See module initialization documentation.
max_norm (float, optional): See module initialization documentation.
norm_type (float, optional): See module initialization documentation. Default ``2``.
scale_grad_by_freq (boolean, optional): See module initialization documentation. Default ``False``.
sparse (bool, optional): See module initialization documentation.
Examples::
>>> # FloatTensor containing pretrained weights
>>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
>>> embedding = nn.Embedding.from_pretrained(weight)
>>> # Get embeddings for index 1
>>> input = torch.LongTensor([1])
>>> embedding(input)
tensor([[ 4.0000, 5.1000, 6.3000]])
"""
assert embeddings.dim() == 2, \
'Embeddings parameter is expected to be 2-dimensional'
rows, cols = embeddings.shape
embedding = cls(
num_embeddings=rows,
embedding_dim=cols,
_weight=embeddings,
padding_idx=padding_idx,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
embedding.weight.requires_grad = not freeze
return embedding
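As the forward comments above suggest, nn.Embedding is just a learnable lookup table: calling the module is equivalent to indexing its weight matrix with the input ids. A minimal sketch (the sizes are arbitrary illustrations):

import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=10, embedding_dim=4)   # weight: [10, 4]
ids = torch.LongTensor([[1, 2, 4], [4, 3, 9]])           # [batch_size=2, seq_len=3]

out = emb(ids)                                           # [2, 3, 4]
same = emb.weight[ids]                                   # plain row indexing of the weight table
print(torch.allclose(out, same))                         # True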
The PyTorch embedding implementation is wrapped fairly deeply, so it is also worth looking at how the TensorFlow version of BERT implements the embedding lookup:
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word embeddings.
      If False, use `tf.nn.embedding_lookup()`. One hot is better for TPUs.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  # embedding_table has shape [vocab_size, embedding_size]
  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))

  # After expand_dims, input_ids has shape [batch_size, seq_length, 1]; reshaping with [-1]
  # gives a 1-D flat_input_ids of length batch_size * seq_length
  flat_input_ids = tf.reshape(input_ids, [-1])
  if use_one_hot_embeddings:
    # Each element of flat_input_ids is converted to a one-hot vector of size vocab_size:
    # the position equal to the element's value is 1 and all other positions are 0, so
    # one_hot_input_ids has shape [batch_size * seq_length, vocab_size].
    # E.g. flat_input_ids = [1, 3] with vocab_size = 4 gives
    # one_hot_input_ids = [[0, 1, 0, 0], [0, 0, 0, 1]], shape (2, 4) = [batch_size * seq_length, vocab_size]
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
    # Multiply one_hot_input_ids by embedding_table:
    # [batch_size * seq_length, vocab_size] x [vocab_size, embedding_size]
    # -> output of shape [batch_size * seq_length, embedding_size]
    output = tf.matmul(one_hot_input_ids, embedding_table)
  else:
    # Gather the rows of embedding_table indexed by flat_input_ids: batch_size * seq_length rows,
    # each with embedding_size elements
    output = tf.gather(embedding_table, flat_input_ids)

  input_shape = get_shape_list(input_ids)

  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])
  return (output, embedding_table)
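The one-hot branch and the gather branch compute the same result; the one-hot form simply expresses the lookup as a matrix multiplication, which is friendlier to TPUs. A small PyTorch sketch of the equivalence (the sizes are arbitrary):

import torch
import torch.nn.functional as F

vocab_size, embedding_size = 6, 4
embedding_table = torch.randn(vocab_size, embedding_size)
flat_input_ids = torch.tensor([1, 3, 3, 0])                            # [batch_size * seq_length]

# gather-style lookup: pick rows of the table
gathered = embedding_table[flat_input_ids]                             # [4, embedding_size]

# one-hot-style lookup: one-hot encode, then matmul with the table
one_hot = F.one_hot(flat_input_ids, num_classes=vocab_size).float()    # [4, vocab_size]
via_matmul = one_hot @ embedding_table                                 # [4, embedding_size]

print(torch.allclose(gathered, via_matmul))                            # True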
LayerNorm
Normalization layers: the main methods to date are Batch Normalization (2015), Layer Normalization (2016), Instance Normalization (2017), Group Normalization (2018), and Switchable Normalization (2018).
Readers who have worked on image processing are familiar with batch norm: YOLOv2 added BN on top of YOLOv1 and gained a substantial performance boost. Data is usually normalized before training so that its distribution is consistent, but a deep network is trained one batch at a time, and each batch can have a different distribution. Batch normalization forcibly pulls the activations back toward a zero-mean, unit-variance normal distribution, which both keeps the distributions consistent and helps prevent vanishing gradients.
In an RNN (and likewise in BERT), however, a batch consists of multiple sentences of different lengths, unlike images whose width and height are fixed, so per-batch statistics are awkward to compute. Layer Normalization instead normalizes each sample over its feature (hidden) dimension, which is why it is used here.
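To make the difference concrete, here is a small sketch (with made-up sizes, illustrative only; the real layers also learn a scale and shift) of which axes the statistics are computed over in a recent PyTorch version: batch norm normalizes each feature across the batch, while layer norm normalizes each sample (each token, in BERT) across its hidden dimension.

import torch

x = torch.randn(8, 300, 768)                 # [batch_size, seq_len, hidden_size]

# Batch-norm style: statistics per feature, computed across batch and sequence positions
bn_mean = x.mean(dim=(0, 1))                 # [768]
bn_std = x.std(dim=(0, 1))                   # [768]
x_bn = (x - bn_mean) / (bn_std + 1e-12)

# Layer-norm style (what BERT uses): statistics per token, computed across the hidden dimension
ln_mean = x.mean(dim=-1, keepdim=True)       # [8, 300, 1]
ln_std = x.std(dim=-1, keepdim=True)         # [8, 300, 1]
x_ln = (x - ln_mean) / (ln_std + 1e-12)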
The LayerNorm source lives in pytorch_pretrained_bert/modeling.py, class BertLayerNorm:
class BertLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root)."""
        super(BertLayerNorm, self).__init__()
        # Learnable scale (gamma) and shift (beta), both of shape [hidden_size]
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        # Mean and variance over the last (hidden) dimension, one pair per token
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        # Normalize, then apply the learnable scale and shift
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias
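This is the same computation as torch.nn.LayerNorm applied over the last dimension; a quick sketch to check that the two agree (assuming the BertLayerNorm class above is in scope):

import torch
import torch.nn as nn

hidden_size = 768
x = torch.randn(2, 5, hidden_size)

bert_ln = BertLayerNorm(hidden_size, eps=1e-12)
torch_ln = nn.LayerNorm(hidden_size, eps=1e-12)   # weight=1, bias=0 by default, same as above

print(torch.allclose(bert_ln(x), torch_ln(x), atol=1e-6))   # True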
The role of dropout
Dropout is a useful trick when training deep neural networks: in each training batch, setting a fraction of the hidden units to zero noticeably reduces overfitting. It also reduces co-adaptation between feature detectors (hidden units), i.e. the situation where some detectors only work in combination with particular other detectors.
Here it is applied via nn.Dropout(config.hidden_dropout_prob); the default value of config.hidden_dropout_prob is 0.1.
Here is a small example of applying dropout:
m = torch.nn.Dropout(p=0.5)
input = torch.randn(3, 4)
output = m(input)
print(input)
print(output)
The output is shown below. Note that the surviving elements are scaled by 1/(1 - p) = 2, so the expected value of each activation is preserved:
tensor([[-0.5077, -0.3724, -0.6053, 0.7276],
[ 2.3065, -1.1119, 0.1846, -0.8296],
[ 1.2716, 0.4014, 0.2034, 0.8699]])
tensor([[-1.0153, -0.7447, -1.2105, 0.0000],
[ 4.6130, -0.0000, 0.0000, -1.6592],
[ 2.5433, 0.8027, 0.0000, 1.7397]])
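At inference time the module should be switched to eval mode, which turns dropout into the identity. A quick check, reusing m and input from the example above:

m.eval()                              # disables dropout
print(torch.equal(m(input), input))   # True: the input passes through unchanged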
References:
Thanks to the following blog authors, whose articles guided this write-up:
https://blog.csdn.net/liuxiao214/article/details/81037416
https://blog.csdn.net/program_developer/article/details/80737724