GRU的类模板参数与对象参数与RNN一致
import torch
from torch import nn
from torch.nn import functional as F
# Smoke test for nn.GRU: its constructor and call signature mirror nn.RNN.
seq_len = 85
batch_size = 32
embedding_dim = 256
output_size = 512
bidirectional = True
num_layers = 2
# Number of single-direction recurrent chains: one per layer, doubled when
# the network is bidirectional. This is the leading dimension of h0 / hn.
single_rnn_nums = num_layers * 2 if bidirectional else num_layers
# input_size is tied to embedding_dim so the model and the sample input
# cannot drift apart (it used to be a separate hard-coded 256).
model = nn.GRU(
    input_size=embedding_dim,
    hidden_size=output_size,
    num_layers=num_layers,
    bidirectional=bidirectional,
)
x = torch.randn([seq_len, batch_size, embedding_dim])
# Every time step produces a hidden state. `output` collects the top-layer
# hidden state of every step with shape
# [seq_len, batch_size, num_directions * output_size]
# (num_directions is 2 when bidirectional, otherwise 1).
h0 = torch.zeros([single_rnn_nums, batch_size, output_size])
# hn holds the final hidden state of each single-direction chain:
# [single_rnn_nums, batch_size, output_size].
output, hn = model(x, h0)
将RNN替换为GRU
class ParamConfig():
    """Hyper-parameters and file locations shared by the training script.

    Class attributes hold the static settings; the constructor adds the
    vocabulary-dependent values.

    Args:
        isTest: When true, skip loading the real data and use a placeholder
            vocabulary size so the model can be smoke-tested quickly.
        seq_len: Sequence length stored on the instance.
    """

    # Model hyper-parameters.
    padding_idx = 0
    embedding_dim = 256
    hidden_size = 512
    output_size = 2
    bidirectional = True
    num_layers = 2
    batch_size = 128
    debug = False

    # File locations, resolved relative to the first entry of sys.path.
    BASE_DIR = sys.path[0]
    param_path = os.path.join(BASE_DIR, "model/model_gru2.pkl")
    log_file = os.path.join(BASE_DIR, "main.log")

    def __init__(self, isTest=False, seq_len=seq_len) -> None:
        if isTest:
            # No real data needed: any vocabulary size will do for a dry run.
            self.dict_len = 10000
            self.seq_len = seq_len
            # Shadows the class-level default so shape traces are printed.
            self.debug = True
            return

        vocabulary, word2idx = load_hotel(return_dict=True)
        dict_len = len(vocabulary)
        print(f"dict_len:{dict_len}")  # sample output noted by the author: dict_len:21437
        self.dict_len, self.word2idx, self.seq_len = dict_len, word2idx, seq_len


# Module-level configuration instance used by the code that follows.
pm = ParamConfig()
class RNNClassify1(nn.Module):
    """GRU text classifier: embedding -> GRU -> sum over time -> linear head.

    Args:
        dict_len: Vocabulary size (number of rows in the embedding table).
        input_size: Embedding dimension, also the GRU input size.
        hidden_size: GRU hidden size per direction.
        output_size: Number of output classes.
        num_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        debug: When true, print intermediate tensor shapes in ``forward``.
    """

    def __init__(self, dict_len, input_size, hidden_size, output_size, num_layers=2, bidirectional=True, debug=pm.debug):
        super(RNNClassify1, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=dict_len,
                                      embedding_dim=input_size,
                                      padding_idx=0)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.debug = debug
        self.model = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
        # Number of single-direction chains; this is the leading dim of h0.
        self.single_rnn_nums = num_layers * 2 if bidirectional else num_layers
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """Classify a batch of index sequences.

        Args:
            X: 2-D tensor of token indices, [batch, seq_len].

        Returns:
            Tensor of shape [batch, output_size] with unnormalised scores.

        Raises:
            ValueError: If ``X`` is not 2-dimensional.
        """
        if X.ndim != 2:
            # `throw` is not Python; raise a real exception instead.
            raise ValueError("X must be 2-dimensional")
        # The batch size is read from the input because the final batch of an
        # epoch is usually smaller than the configured batch size.
        batch_size = X.shape[0]
        x = self.embedding(X)  # [B, seq_len] --> [B, seq_len, embedding]
        if self.debug:
            print(f"embedding:{x.shape}")
        # nn.GRU defaults to batch_first=False, so put time first.
        x = torch.permute(input=x, dims=(1, 0, 2))  # --> [seq_len, B, embedding]
        if self.debug:
            print(f"permute后x.shape:{x.shape}")
        # Allocate h0 on the same device/dtype as the activations so the
        # module keeps working after .to(device) or .half().
        h0 = torch.zeros(self.single_rnn_nums, batch_size, self.hidden_size,
                         dtype=x.dtype, device=x.device)
        # h0 and hn are the two ends of each single-direction chain: the
        # initial state before the first token and the state after the last.
        out, hn = self.model(x, h0)
        # Two ways to get one context vector per sequence:
        #   (a) sum hn over its chain dimension (torch.sum(hn, dim=0)), or
        #   (b) fold the per-step outputs, as done here.
        # Concatenating the chains instead of adding them is another option.
        if self.model.bidirectional:
            # The last dim is [forward | backward]; add the halves together.
            # Skipped for a unidirectional GRU, where the upper slice would be
            # empty and the addition would fail.
            out = out[:, :, self.hidden_size:] + out[:, :, :self.hidden_size]
        # Sum over the time dimension --> [B, hidden_size].
        out = torch.sum(input=out, dim=0)
        # Fully connected classification head.
        out = self.fc(out)
        return out
欠拟合
训练集精度不够 此时,可尝试一些组件
过拟合
当训练集精度已经接近100%时,损失函数的值已经非常小了 但测试集上精度不再升 也就是说这个模型的能力已经全部展现了 这时候可能一些小的优化就不起作用了
#全连接分类 out = self.fc1(out) out = self.fc2(out) out = self.fc3(out) 相比 out = self.fc(out) 多层可以让模型有更快的收敛速度,但几乎不影响最终的精度 self.fc1 = nn.Linear(hidden_size, 4*hidden_size) self.fc2 = nn.Linear(4*hidden_size, hidden_size) self.fc3 = nn.Linear(hidden_size, output_size) 第二个影响是 单层全连接 在训练开始 在训练集上有较高的得分,测试集得分较低,二者差距明显 多层全连接 在训练开始 在训练集与测试集的得分差异不大,即欠拟合的情况不明显
多层全连接代码
class RNNClassify1(nn.Module):
    """GRU text classifier with a three-layer fully connected head.

    Pipeline: embedding -> GRU -> sum over time -> fc1 -> fc2 -> fc3.

    Args:
        dict_len: Vocabulary size (number of rows in the embedding table).
        input_size: Embedding dimension, also the GRU input size.
        hidden_size: GRU hidden size per direction.
        output_size: Number of output classes.
        num_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        debug: When true, print intermediate tensor shapes in ``forward``.
    """

    def __init__(self, dict_len, input_size, hidden_size, output_size, num_layers=2, bidirectional=True, debug=pm.debug):
        super(RNNClassify1, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=dict_len,
                                      embedding_dim=input_size,
                                      padding_idx=0)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.debug = debug
        self.model = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
        # Number of single-direction chains; this is the leading dim of h0.
        self.single_rnn_nums = num_layers * 2 if bidirectional else num_layers
        # Expand to 4x the hidden size, project back, then classify.
        # NOTE(review): there is no activation between these layers, so the
        # stack is mathematically a single affine map; confirm this is intended.
        self.fc1 = nn.Linear(hidden_size, 4 * hidden_size)
        self.fc2 = nn.Linear(4 * hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """Classify a batch of index sequences.

        Args:
            X: 2-D tensor of token indices, [batch, seq_len].

        Returns:
            Tensor of shape [batch, output_size] with unnormalised scores.

        Raises:
            ValueError: If ``X`` is not 2-dimensional.
        """
        if X.ndim != 2:
            # `throw` is not Python; raise a real exception instead.
            raise ValueError("X must be 2-dimensional")
        # The batch size is read from the input because the final batch of an
        # epoch is usually smaller than the configured batch size.
        batch_size = X.shape[0]
        x = self.embedding(X)  # [B, seq_len] --> [B, seq_len, embedding]
        if self.debug:
            print(f"embedding:{x.shape}")
        # nn.GRU defaults to batch_first=False, so put time first.
        x = torch.permute(input=x, dims=(1, 0, 2))  # --> [seq_len, B, embedding]
        if self.debug:
            print(f"permute后x.shape:{x.shape}")
        # Allocate h0 on the same device/dtype as the activations so the
        # module keeps working after .to(device) or .half().
        h0 = torch.zeros(self.single_rnn_nums, batch_size, self.hidden_size,
                         dtype=x.dtype, device=x.device)
        # h0 and hn are the two ends of each single-direction chain: the
        # initial state before the first token and the state after the last.
        out, hn = self.model(x, h0)
        # Two ways to get one context vector per sequence:
        #   (a) sum hn over its chain dimension (torch.sum(hn, dim=0)), or
        #   (b) fold the per-step outputs, as done here.
        # Concatenating the chains instead of adding them is another option.
        if self.model.bidirectional:
            # The last dim is [forward | backward]; add the halves together.
            # Skipped for a unidirectional GRU, where the upper slice would be
            # empty and the addition would fail.
            out = out[:, :, self.hidden_size:] + out[:, :, :self.hidden_size]
        # Sum over the time dimension --> [B, hidden_size].
        out = torch.sum(input=out, dim=0)
        # Fully connected classification head.
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.fc3(out)
        return out
num_layers对RNN的影响
num_layers从2增加到5,精度并没有提高:最高仍然只到89%,大部分时间在86%-88%之间震荡。RNN的层数为2时就已经将有用的信息提取完了,更多的层数没有提取出更有用的信息。
代码
class ParamConfig():
    """Hyper-parameters and file locations for the 5-layer GRU experiment.

    Class attributes hold the static settings; the constructor adds the
    vocabulary-dependent values.

    Args:
        isTest: When true, skip loading the real data and use a placeholder
            vocabulary size so the model can be smoke-tested quickly.
        seq_len: Sequence length stored on the instance.
    """

    # Model hyper-parameters.
    padding_idx = 0
    embedding_dim = 256
    hidden_size = 512
    output_size = 2
    bidirectional = True
    num_layers = 5
    batch_size = 128
    debug = False

    # File locations, resolved relative to the first entry of sys.path.
    BASE_DIR = sys.path[0]
    param_path = os.path.join(BASE_DIR, "model/model_gru7_2.pkl")
    log_file = os.path.join(BASE_DIR, "main.log")

    def __init__(self, isTest=False, seq_len=seq_len) -> None:
        if isTest:
            # No real data needed: any vocabulary size will do for a dry run.
            self.dict_len = 10000
            self.seq_len = seq_len
            # Shadows the class-level default so shape traces are printed.
            self.debug = True
            return

        vocabulary, word2idx = load_hotel(return_dict=True)
        dict_len = len(vocabulary)
        print(f"dict_len:{dict_len}")  # sample output noted by the author: dict_len:21437
        self.dict_len, self.word2idx, self.seq_len = dict_len, word2idx, seq_len


# Module-level configuration instance used by the code that follows.
pm = ParamConfig()
# For a quick dry run without the real data, use instead:
# pm = ParamConfig(isTest=True)
class MyDataSet(Dataset):
    """Map-style dataset that pairs each sample in ``X`` with its label in ``y``.

    Args:
        X: Indexable collection of samples.
        y: Indexable collection of labels, aligned with ``X``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The dataset size is defined by the number of samples.
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``idx``-th (sample, label) pair as int64 tensors."""
        sample = torch.tensor(data=self.X[idx]).long()
        label = torch.tensor(data=self.y[idx]).long()
        return sample, label
# Wrap the train and test splits so they can be indexed sample by sample.
train_dataset = MyDataSet(X=X_train, y=y_train)
test_dataset = MyDataSet(X=X_test, y=y_test)
# Serve the training set in shuffled mini-batches.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=pm.batch_size,
    shuffle=True,
)
# Peek at the first batch only, to check the tensor shapes.
for X, y in train_dataloader:
    # Sample output noted by the author:
    # torch.Size([128, 85]) 2 torch.Size([128]) 1
    print(X.shape, X.ndim, y.shape, y.ndim)
    break
class RNNClassify1(nn.Module):
    """GRU text classifier with a two-layer fully connected head.

    Pipeline: embedding -> GRU -> sum over time -> fc1 -> fc2.

    Args:
        dict_len: Vocabulary size (number of rows in the embedding table).
        input_size: Embedding dimension, also the GRU input size.
        hidden_size: GRU hidden size per direction.
        output_size: Number of output classes.
        num_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        debug: When true, print intermediate tensor shapes in ``forward``.
    """

    def __init__(self, dict_len, input_size, hidden_size, output_size, num_layers=2, bidirectional=True, debug=pm.debug):
        super(RNNClassify1, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=dict_len,
                                      embedding_dim=input_size,
                                      padding_idx=0)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.debug = debug
        self.model = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
        # Number of single-direction chains; this is the leading dim of h0.
        self.single_rnn_nums = num_layers * 2 if bidirectional else num_layers
        # NOTE(review): there is no activation between these layers, so the
        # stack is mathematically a single affine map; confirm this is intended.
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """Classify a batch of index sequences.

        Args:
            X: 2-D tensor of token indices, [batch, seq_len].

        Returns:
            Tensor of shape [batch, output_size] with unnormalised scores.

        Raises:
            ValueError: If ``X`` is not 2-dimensional.
        """
        if X.ndim != 2:
            # `throw` is not Python; raise a real exception instead.
            raise ValueError("X must be 2-dimensional")
        # The batch size is read from the input because the final batch of an
        # epoch is usually smaller than the configured batch size.
        batch_size = X.shape[0]
        x = self.embedding(X)  # [B, seq_len] --> [B, seq_len, embedding]
        if self.debug:
            print(f"embedding:{x.shape}")
        # nn.GRU defaults to batch_first=False, so put time first.
        x = torch.permute(input=x, dims=(1, 0, 2))  # --> [seq_len, B, embedding]
        if self.debug:
            print(f"permute后x.shape:{x.shape}")
        # Allocate h0 on the same device/dtype as the activations so the
        # module keeps working after .to(device) or .half().
        h0 = torch.zeros(self.single_rnn_nums, batch_size, self.hidden_size,
                         dtype=x.dtype, device=x.device)
        # h0 and hn are the two ends of each single-direction chain: the
        # initial state before the first token and the state after the last.
        out, hn = self.model(x, h0)
        # Two ways to get one context vector per sequence:
        #   (a) sum hn over its chain dimension (torch.sum(hn, dim=0)), or
        #   (b) fold the per-step outputs, as done here.
        # Concatenating the chains instead of adding them is another option.
        if self.model.bidirectional:
            # The last dim is [forward | backward]; add the halves together.
            # Skipped for a unidirectional GRU, where the upper slice would be
            # empty and the addition would fail.
            out = out[:, :, self.hidden_size:] + out[:, :, :self.hidden_size]
        # Sum over the time dimension --> [B, hidden_size].
        out = torch.sum(input=out, dim=0)
        # Fully connected classification head.
        out = self.fc1(out)
        out = self.fc2(out)
        return out
nn.ReLU()
$\mathrm{ReLU}(x) = (x)^{+} = \max(0, x)$
nn.PReLU()
$\mathrm{PReLU}(x) = \max(0, x) + a \cdot \min(0, x)$。ReLU直接舍弃负值,PReLU则是给负值乘以一个权重 $a$。
relu在此场景的效果
ReLU 没有增益 PReLU 未验证
class RNNClassify1(nn.Module):
    """GRU text classifier with dropout inside the fully connected head.

    Pipeline: embedding -> GRU -> sum over time -> fc1 -> dropout -> fc2.

    Args:
        dict_len: Vocabulary size (number of rows in the embedding table).
        input_size: Embedding dimension, also the GRU input size.
        hidden_size: GRU hidden size per direction.
        output_size: Number of output classes.
        num_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        debug: When true, print intermediate tensor shapes in ``forward``.
    """

    def __init__(self, dict_len, input_size, hidden_size, output_size, num_layers=2, bidirectional=True, debug=pm.debug):
        super(RNNClassify1, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=dict_len,
                                      embedding_dim=input_size,
                                      padding_idx=0)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.debug = debug
        self.model = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
        # Number of single-direction chains; this is the leading dim of h0.
        self.single_rnn_nums = num_layers * 2 if bidirectional else num_layers
        self.dropout = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """Classify a batch of index sequences.

        Args:
            X: 2-D tensor of token indices, [batch, seq_len].

        Returns:
            Tensor of shape [batch, output_size] with unnormalised scores.

        Raises:
            ValueError: If ``X`` is not 2-dimensional.
        """
        if X.ndim != 2:
            # `throw` is not Python; raise a real exception instead.
            raise ValueError("X must be 2-dimensional")
        # The batch size is read from the input because the final batch of an
        # epoch is usually smaller than the configured batch size.
        batch_size = X.shape[0]
        x = self.embedding(X)  # [B, seq_len] --> [B, seq_len, embedding]
        if self.debug:
            print(f"embedding:{x.shape}")
        # nn.GRU defaults to batch_first=False, so put time first.
        x = torch.permute(input=x, dims=(1, 0, 2))  # --> [seq_len, B, embedding]
        if self.debug:
            print(f"permute后x.shape:{x.shape}")
        # Allocate h0 on the same device/dtype as the activations so the
        # module keeps working after .to(device) or .half().
        h0 = torch.zeros(self.single_rnn_nums, batch_size, self.hidden_size,
                         dtype=x.dtype, device=x.device)
        # h0 and hn are the two ends of each single-direction chain: the
        # initial state before the first token and the state after the last.
        out, hn = self.model(x, h0)
        if self.model.bidirectional:
            # The last dim is [forward | backward]; add the halves together.
            # Skipped for a unidirectional GRU, where the upper slice would be
            # empty and the addition would fail.
            out = out[:, :, self.hidden_size:] + out[:, :, :self.hidden_size]
        # Sum over the time dimension --> [B, hidden_size].
        out = torch.sum(input=out, dim=0)
        # Fully connected classification head with dropout in between.
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out
p=0.5在这个以选负面单词为目的的场景中,
值过大,导致精度在86%-88%之间震荡
双层双向
RNN网络仍然是双层双向,这个不变,这个也可能是效果较好的
序列依赖的长度
序列过长会导致信息丢失严重;经验上,序列长度超过20时信息就开始丢失
输出output与hn
这一步可以变,针对不同的业务效果可能会不同: # out = torch.sum(input=hn, dim=0) out = out[:,:,self.hidden_size:] + out[:,:,:self.hidden_size] out = torch.sum(input=out, dim=0) 这里的效果是使用output好一些。在不对单词进行额外数据加工的情况下,GRU能达到89%,偶尔升到90%又很快会降到89%,仅次于TextCNN的91%;RNN可以达到88%。这里还没有实验拼接的效果。目前看,TextCNN是文本分类项目中那个又快又好的模型,快指训练速度快。
激活函数ReLU、Leaky ReLU、PReLU和RReLU