-
torch.nn.CrossEntropyLoss
(weight: Optional[torch.Tensor] = None, size_average=None, ignore_index: int = -100, reduce=None, reduction: str = 'mean')
The input is expected to contain raw, unnormalized scores for each class: no softmax is needed, just feed in the raw scores.
Used for single-label classification.
input: (N, C), target: (N)
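A minimal runnable sketch (the shapes and values below are made up for illustration):
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.randn(4, 3)           # (N, C): raw, unnormalized scores
target = torch.tensor([0, 2, 1, 2])  # (N): one class index per sample
loss = criterion(logits, target)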
-
Multi-label classification
torch.nn.MultiLabelSoftMarginLoss
(weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean')
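Likewise a minimal sketch (toy shapes and values); here the target is a multi-hot matrix rather than class indices:
import torch
import torch.nn as nn

criterion = nn.MultiLabelSoftMarginLoss()
logits = torch.randn(2, 3)                           # (N, C): raw scores
target = torch.tensor([[1., 0., 1.],
                       [0., 1., 0.]])                # (N, C): multi-hot labels
loss = criterion(logits, target)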
-
Field(supar)
Defines a field that handles one column of the dataset:
for example, how to turn that column of a sentence into a tensor, and whether to build a vocabulary over the column for later numericalization.
In supar, for example:
WORD = Field('words', pad=pad, unk=unk, lower=True)
FEAT = SubwordField('chars', pad=pad, unk=unk, fix_len=args.fix_len)
FEAT = SubwordField('bert', pad=tokenizer.pad_token, unk=tokenizer.unk_token, fix_len=args.fix_len, tokenize=tokenizer.tokenize)
EDGE = ChartField('edges', use_vocab=False, fn=CoNLL.get_edges)
LABEL = ChartField('labels', fn=CoNLL.get_labels)
# field.build(dataset): build a vocabulary from dataset
FEAT.build(train)
ChartField:
-
Transform(supar)
A Transform is a class made up of several fields; each field processes the corresponding column of the data, and different transforms are composed for data with different structures.
For example, CoNLL is a subclass of Transform with ten columns: fields = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
transform = CoNLL(FORM=(WORD, FEAT), PHEAD=(EDGE, LABEL))
transform = CoNLL(FORM=WORD, CPOS=FEAT, PHEAD=(EDGE, LABEL))
transform.load(path)
# Reads the dataset and returns [conll_sentence]; each conll_sentence represents one sentence.
# A conll_sentence has values = [(), (), ...]; each tuple holds the entire contents of one column.
-
Dataset(supar)
Loading the data, numericalizing it, and creating the dataloader (which yields the batches) all happen here.
train = Dataset(transform, args.train)  # the second argument is the path to the data
-
x = torch.rand(2,3,5)
Returns a tensor of size [2, 3, 5].
-
torch.ones_like(x)
x = torch.rand(5, 10, 600)
y = torch.ones_like(x[..., :1])
z = torch.ones_like(x)
print(z.shape)
print(y.shape)
>> torch.Size([5, 10, 600])
   torch.Size([5, 10, 1])
torch.ones(5, 10, 600)  # builds the same shape as z directly
For example, a biaffine scorer (as in supar's Biaffine module) uses it to append a bias column:
def forward(self, x, y):
    r"""
    Args:
        x (torch.Tensor): ``[batch_size, seq_len, n_in]``.
        y (torch.Tensor): ``[batch_size, seq_len, n_in]``.

    Returns:
        ~torch.Tensor:
            A scoring tensor of shape ``[batch_size, n_out, seq_len, seq_len]``.
            If ``n_out=1``, the dimension for ``n_out`` will be squeezed automatically.
    """
    if self.bias_x:
        x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
    if self.bias_y:
        y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
    # [batch_size, n_out, seq_len, seq_len]
    s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
    # remove dim 1 if n_out == 1
    s = s.squeeze(1)
    return s
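To see what the einsum contraction does shape-wise, here is a small standalone check (the sizes are arbitrary toy values):
import torch

batch_size, seq_len, n_in, n_out = 2, 5, 4, 3
x = torch.randn(batch_size, seq_len, n_in + 1)   # +1 for the appended bias column
y = torch.randn(batch_size, seq_len, n_in + 1)
weight = torch.randn(n_out, n_in + 1, n_in + 1)
s = torch.einsum('bxi,oij,byj->boxy', x, weight, y)
print(s.shape)  # torch.Size([2, 3, 5, 5])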
-
Masking out the padding
x = torch.tensor([[2, 1, 432, 52, 0],
                  [2, 5, 234,  0, 0]])  # [2, 5]
mask = x.ne(0)                          # [2, 5]
>> mask
tensor([[ True,  True,  True,  True, False],
        [ True,  True,  True, False, False]])
mask1 = mask.unsqueeze(1)  # [2, 1, 5]
mask2 = mask.unsqueeze(2)  # [2, 5, 1]
>> mask1
tensor([[[ True,  True,  True,  True, False]],
        [[ True,  True,  True, False, False]]])
>> mask2
tensor([[[ True],
         [ True],
         [ True],
         [ True],
         [False]],
        [[ True],
         [ True],
         [ True],
         [False],
         [False]]])
mask3 = mask2 & mask1  # [2, 5, 5], i.e. five rows by five columns
>> mask3
tensor([[[ True,  True,  True,  True, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True, False],
         [False, False, False, False, False]],
        [[ True,  True,  True, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False]]])
-
Masking out the first token (bos) of every sentence
>> mask
tensor([[[ True,  True,  True,  True, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True, False],
         [False, False, False, False, False]],
        [[ True,  True,  True, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False]]])
mask[:, 0] = 0  # ":" covers everything along dim 0; [:, 0] assigns 0 to row 0 of every 2-D matrix
>> mask
tensor([[[False, False, False, False, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True, False],
         [False, False, False, False, False]],
        [[False, False, False, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False]]])
>> print(mask[1, 1])
tensor([ True,  True,  True, False, False])
-
Set the indices that are >= num_embeddings to unk_index
if hasattr(self, 'pretrained'):
    ext_mask = words.ge(self.word_embed.num_embeddings)
    # a mask with the same size as words: True where words >= self.word_embed.num_embeddings, False elsewhere
    ext_words = words.masked_fill(ext_mask, self.unk_index)
    # words itself is unchanged; positions where ext_mask is True are set to self.unk_index in the new ext_words
-
pack_padded_sequence, pad_packed_sequence (to study further)
x = pack_padded_sequence(embed, mask.sum(1), True, False)
x, _ = self.lstm(x)
x, _ = pad_packed_sequence(x, True, total_length=seq_len)
x = self.lstm_dropout(x)
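A self-contained sketch of the same pattern under assumed toy sizes (the LSTM, shapes and mask below are made up, not the model above): packing drops the padded positions so the LSTM never sees them, unpacking restores the padded layout.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

batch_size, seq_len, n_in, n_hidden = 2, 5, 8, 16
embed = torch.randn(batch_size, seq_len, n_in)
mask = torch.tensor([[1, 1, 1, 1, 0],
                     [1, 1, 1, 0, 0]]).bool()   # real tokens vs padding
lstm = nn.LSTM(n_in, n_hidden, batch_first=True)

packed = pack_padded_sequence(embed, mask.sum(1), batch_first=True, enforce_sorted=False)
out, _ = lstm(packed)
out, _ = pad_packed_sequence(out, batch_first=True, total_length=seq_len)
print(out.shape)  # torch.Size([2, 5, 16]), padded positions refilled with zeros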
-
Indexing values with a mask
>> mask.shape
[145, 22, 22]
>> s_edge.shape
[145, 22, 22, 2]
>> edges.shape
[145, 22, 22]
x = s_edge[mask]  # x: [k, 2]
y = edges[mask]   # y: [k]
# k is the number of True entries in mask
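A tiny self-contained version of the same pattern (the shapes and tensors here are made up):
import torch

s_edge = torch.randn(2, 3, 3, 2)
edges = torch.randint(0, 2, (2, 3, 3))
mask = torch.rand(2, 3, 3) > 0.5
x = s_edge[mask]   # [k, 2]
y = edges[mask]    # [k]
print(x.shape, y.shape, mask.sum())  # k == mask.sum()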
-
Worth studying supar's Metric more closely
def __call__(self, preds, golds):
    pred_mask = preds.ge(0)
    gold_mask = golds.ge(0)
    span_mask = pred_mask & gold_mask
    self.pred += pred_mask.sum().item()
    self.gold += gold_mask.sum().item()
    self.tp += (preds.eq(golds) & span_mask).sum().item()
    self.utp += span_mask.sum().item()
    return self
-
embed = nn.Embedding(num_embeddings=3, embedding_dim=10)
trans = torch.randint(0, 3, (4, 3))
x = embed(trans)
print(x.shape)  # [4, 3, 10]
-
x = torch.ones(1, 2, 3, 10)
x = torch.sum(x, dim=2)
print(x.shape)
>> [1, 2, 10]
-
y = torch.ones(1, 1, 10)
y_ = y.expand(2, -1, -1)  # dim 0 becomes 2; the -1s mean those dims stay unchanged (the content is replicated along dim 0)
-
torch.gather
(input, dim, index, *, sparse_grad=False, out=None) → Tensor
- input (Tensor) – the source tensor
- dim (int) – the axis along which to index
- index (LongTensor) – the indices of elements to gather
- sparse_grad (bool, optional) – If True, gradient w.r.t. input will be a sparse tensor.
- out (Tensor, optional) – the destination tensor
x = torch.cat([x, null_hidden], dim=1)  # x: [batch_size, seq_len+1, hidden_size*2]
idx = torch.randint(0, seq_len+1, (batch_size, transition_num, id_num))  # idx: [batch_size, transition_num, id_num]
# What I want is [batch_size, transition_num, id_num, hidden_size*2]:
# batch matched to batch, using the ids to pick out the states
idx = idx.view(batch_size, transition_num*id_num).unsqueeze(-1).expand(-1, -1, hidden_size*2)
states_hidden = torch.gather(x, 1, idx)  # [batch_size, transition_num*id_num, hidden_size*2]
states_hidden = states_hidden.view(batch_size, transition_num, id_num, -1)
idx must have the same number of dimensions as the source tensor, and the output has the same size as idx.
Worth revisiting.
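A minimal concrete case to keep in mind (values made up): along dim=1, out[i][j] = input[i][index[i][j]].
import torch

x = torch.tensor([[1, 2],
                  [3, 4]])
idx = torch.tensor([[0, 0],
                    [1, 0]])
print(torch.gather(x, 1, idx))
# tensor([[1, 1],
#         [4, 3]])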
-
Gradient clipping (to study further)
nn.utils.clip_grad_norm_(self.model.parameters(), self.args.clip)
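A self-contained sketch of where this call sits in a training step (the toy linear model, SGD optimizer and max_norm=1.0 below are made up for illustration):
import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
x, y = torch.randn(4, 10), torch.randn(4, 1)

optimizer.zero_grad()
loss = nn.functional.mse_loss(model(x), y)
loss.backward()
# rescale all gradients so their combined L2 norm is at most max_norm;
# returns the pre-clipping total norm, handy for logging
total_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()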
-
Slicing
These slices do not actually allocate a new block of memory; they are just views that stride through the same storage, so if you modify the slice, the source tensor changes as well.
states = torch.ones(2, 8, 4, 4) * (-1)
windowed = states[..., :2]
win_states = windowed[:, :, 0:3, :]
-
masked_fill
null_lstm_mask = win_states.eq(-1)
win_states.masked_fill_(null_lstm_mask, 10)
# !!! This in-place operation also modifies states and windowed; if they are used later, bugs can follow.
# Don't use the in-place version here; instead:
win_states = win_states.masked_fill(null_lstm_mask, 10)
-
From now on, don't reach for in-place operations lightly. Even in places where you think it doesn't matter (no gradients needed, the change seems harmless), avoid them unless absolutely necessary.
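A minimal demonstration of why (toy tensor): a slice shares storage with its source, so an in-place update leaks back into it; cloning first keeps the source intact.
import torch

a = torch.zeros(3)
b = a[:2]           # a view: shares storage with a
b += 1              # in-place on the view; a becomes tensor([1., 1., 0.])
c = a[:2].clone()   # a real copy
c += 1              # a is unaffected this time
print(a, c)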
-
Indexing with index tensors (advanced indexing)
>>> x
tensor([[[0.3841, 0.1389, 0.3260, 0.4158, 0.6389, 0.6976, 0.0637, 0.5739],
[0.5719, 0.2547, 0.5951, 0.6418, 0.4269, 0.1382, 0.8245, 0.2387],
[0.8329, 0.2095, 0.5549, 0.8926, 0.9351, 0.3968, 0.9392, 0.7918]],
[[0.0741, 0.9406, 0.4003, 0.0500, 0.9153, 0.6366, 0.2182, 0.9227],
[0.3717, 0.5527, 0.1052, 0.8658, 0.6832, 0.6488, 0.4907, 0.9008],
[0.0440, 0.6651, 0.7734, 0.7687, 0.2805, 0.5114, 0.3318, 0.6839]]])
>>> x.shape
torch.Size([2, 3, 8])
>>> batch_idx
tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
>>> word_idx
tensor([0, 1, 1, 2, 2, 0, 1, 1, 2, 2])
>>> x[batch_idx,word_idx]
tensor([[0.3841, 0.1389, 0.3260, 0.4158, 0.6389, 0.6976, 0.0637, 0.5739],
[0.5719, 0.2547, 0.5951, 0.6418, 0.4269, 0.1382, 0.8245, 0.2387],
[0.5719, 0.2547, 0.5951, 0.6418, 0.4269, 0.1382, 0.8245, 0.2387],
[0.8329, 0.2095, 0.5549, 0.8926, 0.9351, 0.3968, 0.9392, 0.7918],
[0.8329, 0.2095, 0.5549, 0.8926, 0.9351, 0.3968, 0.9392, 0.7918],
[0.0741, 0.9406, 0.4003, 0.0500, 0.9153, 0.6366, 0.2182, 0.9227],
[0.3717, 0.5527, 0.1052, 0.8658, 0.6832, 0.6488, 0.4907, 0.9008],
[0.3717, 0.5527, 0.1052, 0.8658, 0.6832, 0.6488, 0.4907, 0.9008],
[0.0440, 0.6651, 0.7734, 0.7687, 0.2805, 0.5114, 0.3318, 0.6839],
[0.0440, 0.6651, 0.7734, 0.7687, 0.2805, 0.5114, 0.3318, 0.6839]])
>>> lr_action
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
>>> arc
tensor([0, 2])
# I need to set column 0 of row 0 and column 2 of row 1 to 1
>>> lr_action[torch.range(0,1,dtype=torch.long),arc]=1  # torch.arange(2) is the non-deprecated way to build the row indices
>>> lr_action
tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])
-
First mask.eq(value), then
torch.nonzero(source == target)
gives the indices of the non-zero (True) positions; with as_tuple=True the result is instead split by dimension into a tuple of smaller 1-D index tensors.
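A small concrete check of the mask.eq / nonzero pattern (values made up):
import torch

x = torch.tensor([[2, 1, 0],
                  [0, 5, 2]])
print(torch.nonzero(x.eq(2)))  # the (row, col) indices where x == 2
# tensor([[0, 0],
#         [1, 2]])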
-
triu / tril
>>> a=torch.ones((2,6,6)).bool()
>>> a.triu()
tensor([[[ True, True, True, True, True, True],
[False, True, True, True, True, True],
[False, False, True, True, True, True],
[False, False, False, True, True, True],
[False, False, False, False, True, True],
[False, False, False, False, False, True]],
[[ True, True, True, True, True, True],
[False, True, True, True, True, True],
[False, False, True, True, True, True],
[False, False, False, True, True, True],
[False, False, False, False, True, True],
[False, False, False, False, False, True]]])
>>> a.triu(1)
tensor([[[False, True, True, True, True, True],
[False, False, True, True, True, True],
[False, False, False, True, True, True],
[False, False, False, False, True, True],
[False, False, False, False, False, True],
[False, False, False, False, False, False]],
[[False, True, True, True, True, True],
[False, False, True, True, True, True],
[False, False, False, True, True, True],
[False, False, False, False, True, True],
[False, False, False, False, False, True],
[False, False, False, False, False, False]]])
>>> b=a.tril(3)
>>> b
tensor([[[ True, True, True, True, False, False],
[ True, True, True, True, True, False],
[ True, True, True, True, True, True],
[ True, True, True, True, True, True],
[ True, True, True, True, True, True],
[ True, True, True, True, True, True]],
[[ True, True, True, True, False, False],
[ True, True, True, True, True, False],
[ True, True, True, True, True, True],
[ True, True, True, True, True, True],
[ True, True, True, True, True, True],
[ True, True, True, True, True, True]]])
>>> a&b
tensor([[[ True, True, True, True, False, False],
[False, True, True, True, True, False],
[False, False, True, True, True, True],
[False, False, False, True, True, True],
[False, False, False, False, True, True],
[False, False, False, False, False, True]],
[[ True, True, True, True, False, False],
[False, True, True, True, True, False],
[False, False, True, True, True, True],
[False, False, False, True, True, True],
[False, False, False, False, True, True],
[False, False, False, False, False, True]]])
-
torch.topk(input, k, dim)
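A quick illustration (toy values): along dim=1 it returns the k largest values and their indices.
import torch

x = torch.tensor([[3, 1, 2],
                  [9, 7, 8]])
values, indices = torch.topk(x, k=2, dim=1)
# values:  tensor([[3, 2], [9, 8]])
# indices: tensor([[0, 2], [0, 2]])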
-
First, load the tokenizer for the given model
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

>>> encoded_input = tokenizer("Hello, I'm a single sentence!")
>>> print(encoded_input)
{'input_ids': [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
>>> tokenizer.decode(encoded_input["input_ids"])
"[CLS] Hello, I'm a single sentence! [SEP]"
# The tokenizer takes a whole sentence, tokenizes it and returns ids directly (adding [CLS] and [SEP]).
# To tokenize word by word, the following is recommended instead: tokenize first, then look up the ids.
>>> t('Hello-you')
['Hello', '-', 'you']
>>> [dic[token] for token in t('Hello-you')]
[8667, 118, 1128]

# Processing several sentences at once; note that they come back with different lengths:
>>> batch_sentences = ["Hello I'm a single sentence",
...                    "And another sentence",
...                    "And the very very last one"]
>>> encoded_inputs = tokenizer(batch_sentences)
>>> print(encoded_inputs)
{'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102],
               [101, 1262, 1330, 5650, 102],
               [101, 1262, 1103, 1304, 1304, 1314, 1141, 102]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

# To get a batch of same-length sentences returned as tensors:
>>> batch = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
>>> print(batch)
{'input_ids': tensor([[ 101, 8667,  146,  112,  182,  170, 1423, 5650,  102],
                      [ 101, 1262, 1330, 5650,  102,    0,    0,    0,    0],
                      [ 101, 1262, 1103, 1304, 1304, 1314, 1141,  102,    0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
                           [0, 0, 0, 0, 0, 0, 0, 0, 0],
                           [0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                           [1, 1, 1, 1, 1, 0, 0, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 0]])}
# attention_mask marks which tokens are padding
# Handling pairs of sentences: https://huggingface.co/transformers/preprocessing.html
-
Next, load the model
def __init__(self, model, n_layers, n_out=0, stride=256, pooling='mean', pad_index=0, dropout=0, requires_grad=False):
    super().__init__()

    from transformers import AutoConfig, AutoModel, AutoTokenizer
    # output_hidden_states=True makes the model return the hidden states of every layer.
    # Loading the model boils down to the two lines below.
    self.bert = AutoModel.from_pretrained(model, config=AutoConfig.from_pretrained(model, output_hidden_states=True))
    self.bert = self.bert.requires_grad_(requires_grad)

    self.model = model
    self.n_layers = n_layers or self.bert.config.num_hidden_layers
    self.hidden_size = self.bert.config.hidden_size
    self.n_out = n_out or self.hidden_size
    self.stride = stride
    self.pooling = pooling
    self.pad_index = pad_index
    self.dropout = dropout
    self.requires_grad = requires_grad
    self.max_len = int(max(0, self.bert.config.max_position_embeddings) or 1e12) - 2
    self.tokenizer = AutoTokenizer.from_pretrained(model)
    self.scalar_mix = ScalarMix(self.n_layers, dropout)
    self.projection = nn.Linear(self.hidden_size, self.n_out, False) if self.hidden_size != n_out else nn.Identity()
def forward(self, subwords):
    r"""
    Args:
        subwords (~torch.Tensor): ``[batch_size, seq_len, fix_len]``.

    Returns:
        ~torch.Tensor:
            BERT embeddings of shape ``[batch_size, seq_len, n_out]``.
    """
    mask = subwords.ne(self.pad_index)
    lens = mask.sum((1, 2))
    # [batch_size, n_subwords]; the two steps below turn the word-level layout back into
    # sentence-level sequences, cf. the previous note (BERT is still fed whole sentences)
    subwords = pad(subwords[mask].split(lens.tolist()), self.pad_index, padding_side=self.tokenizer.padding_side)
    bert_mask = pad(mask[mask].split(lens.tolist()), 0, padding_side=self.tokenizer.padding_side)

    # return the hidden states of all layers
    bert = self.bert(subwords[:, :self.max_len], attention_mask=bert_mask[:, :self.max_len].float())[-1]
    # [n_layers, batch_size, max_len, hidden_size]
    bert = bert[-self.n_layers:]
    # [batch_size, max_len, hidden_size]
    bert = self.scalar_mix(bert)
    # [batch_size, n_subwords, hidden_size]; this loop encodes overlong sentences chunk by chunk
    for i in range(self.stride, (subwords.shape[1]-self.max_len+self.stride-1)//self.stride*self.stride+1, self.stride):
        part = self.bert(subwords[:, i:i+self.max_len], attention_mask=bert_mask[:, i:i+self.max_len].float())[-1]
        bert = torch.cat((bert, self.scalar_mix(part[-self.n_layers:])[:, self.max_len-self.stride:]), 1)

    # [batch_size, seq_len]
    bert_lens = mask.sum(-1)
    bert_lens = bert_lens.masked_fill_(bert_lens.eq(0), 1)
    # [batch_size, seq_len, fix_len, hidden_size]
    embed = bert.new_zeros(*mask.shape, self.hidden_size).masked_scatter_(mask.unsqueeze(-1), bert[bert_mask])
    # [batch_size, seq_len, hidden_size]
    if self.pooling == 'first':
        embed = embed[:, :, 0]
    elif self.pooling == 'last':
        embed = embed.gather(2, (bert_lens-1).unsqueeze(-1).repeat(1, 1, self.hidden_size).unsqueeze(2)).squeeze(2)
    else:
        embed = embed.sum(2) / bert_lens.unsqueeze(-1)
    embed = self.projection(embed)

    return embed
-
ELMo is rarely used these days; only the simplest usage is recorded here.
-
First, ELMo's tokenization (it works directly on words; there are no subwords)
from allennlp.modules.elmo import batch_to_ids


class ElmoField(Field):

    def build(self, dataset, min_freq=1, embed=None):
        return

    def transform(self, sequences):
        # sequences = [
        #     ['<bos>'] + list(sequence) for sequence in sequences
        # ]
        if self.lower:
            sequences = [list(map(str.lower, sequence)) for sequence in sequences]
        return sequences

    def compose(self, sequences):
        # Composes a batch of sequences into a padded tensor.
        return batch_to_ids(sequences).to(self.device)
-
Then the model
class Elmo(allennlp.modules.elmo.Elmo):

    def __init__(self, layer=3, dropout=0.33, if_requires_grad=False):
        """
        Args:
            layer (int):
            dropout (float):
        """
        self.scalar_mix_parameters = []
        for i in range(layer):
            tmp_lst = [-9e10, -9e10, -9e10]
            tmp_lst[i] = 1.0
            self.scalar_mix_parameters.append(tmp_lst)
        super(Elmo, self).__init__(options_file="data/ELMO/options.json",
                                   weight_file="data/ELMO/weights.hdf5",
                                   num_output_representations=layer,
                                   requires_grad=if_requires_grad,
                                   keep_sentence_boundaries=True,
                                   scalar_mix_parameters=self.scalar_mix_parameters,
                                   dropout=dropout)
        self.n_layers = layer
        self.dropout_rate = dropout
        self.if_requires_grad = if_requires_grad
        self.gamma = nn.Parameter(torch.FloatTensor([1.0]))
        self.softmax_weights = nn.ParameterList([nn.Parameter(torch.FloatTensor([0.0])) for _ in range(layer)])
        # self.reset_parameters()

    # def reset_parameters(self):
    #     nn.init.normal_(self.softmax.weight, 0.0, 0.01)  # softmax layer

    def forward(self, chars, word_inputs=None):
        """
        Args:
            chars:
            word_inputs:

        Returns:
        """
        normed_weights = F.softmax(torch.cat([param for param in self.softmax_weights]), dim=0)
        normed_weights = torch.split(normed_weights, 1)
        # pdb.set_trace()
        res = super(Elmo, self).forward(chars)['elmo_representations']
        final = normed_weights[0] * res[0]
        for i in range(1, self.n_layers):
            final += (normed_weights[i] * res[i])
        final = self.gamma * final
        return final[:, :-1]
-
Find, for each row of c, the index of the matching row in a
>>> a
tensor([[1, 2],
        [3, 4],
        [5, 6]])
>>> c
tensor([[1, 2],
        [5, 6]])
values, indices = torch.topk(((a.t() == c.unsqueeze(-1)).all(dim=1)).int(), 1, 1)
indices = indices[values != 0]
>>> indices
tensor([0, 2])