本质上就是一个文本分类模型:喂入不同的数据、尝试不同的模型,比较最终的效果。
评论数据
1. 取评论的前1024个字符,输入TextCNN
2. 对用户的评论提取关键短语,取top-k个短语,输入TextCNN/TextRNN
3. Transformer + FC
4. Document BERT
textcnn
最基础的模型了,如下图所示:
部分代码如下所示:
class TextCNN(nn.Module):
    """Convolutional text classifier: parallel 1-D convolutions + max-over-time pooling.

    Each filter size in ``config.filter_sizes`` gets its own ``Conv1d`` over the
    embedded sequence; the pooled feature maps are concatenated, passed through
    dropout and projected to ``len(config.labels)`` logits.

    Args:
        pretrain_embed: Accepted so the constructor matches ``LSTMModel``, but
            currently unused - the embedding table is always randomly
            initialised.
        vocab: Sized vocabulary. Its last index (``len(vocab) - 1``) is used
            as the padding id.
        config: Object exposing ``embed_size``, ``num_filters``,
            ``filter_sizes``, ``dropout`` and ``labels``.
    """

    def __init__(self, pretrain_embed, vocab, config):
        super(TextCNN, self).__init__()
        self.config = config
        self.vocab = vocab
        # NOTE: loading frozen pretrained vectors was deliberately disabled
        # here; the table below is trained from scratch.
        self.embedding = nn.Embedding(
            len(vocab), config.embed_size, padding_idx=len(vocab) - 1
        )
        self.convs = nn.ModuleList(
            [
                nn.Conv1d(config.embed_size, config.num_filters, kernel_size)
                for kernel_size in config.filter_sizes
            ]
        )
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(
            config.num_filters * len(config.filter_sizes), len(config.labels)
        )

    def conv_and_pool(self, conv, x):
        """Apply ``conv`` + ReLU, then max-pool over the whole time axis.

        Args:
            conv: One of the ``Conv1d`` modules in ``self.convs``.
            x: Tensor of shape ``[batch_size, embed_size, seq_len]``.

        Returns:
            Tensor of shape ``[batch_size, num_filters]``.
        """
        x = F.relu(conv(x))  # -> [batch_size, num_filters, seq_len - kernel_size + 1]
        return F.max_pool1d(x, x.size(2)).squeeze(2)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: A pair ``(token_ids, mask)``. ``token_ids`` has shape
                ``[batch_size, seq_len]``; the second element is ignored.

        Returns:
            Logits of shape ``[batch_size, len(config.labels)]``.
        """
        tokens, _ = x
        embedded = self.embedding(tokens)       # [batch_size, seq_len, embed_size]
        embedded = embedded.permute(0, 2, 1)    # [batch_size, embed_size, seq_len]

        # Conv1d raises when the sequence is shorter than its kernel. Zero-pad
        # the time axis up to the widest kernel; this matches appending padding
        # tokens as long as the padding row keeps its zero initialisation.
        widest_kernel = max(conv.kernel_size[0] for conv in self.convs)
        shortfall = widest_kernel - embedded.size(2)
        if shortfall > 0:
            embedded = F.pad(embedded, (0, shortfall))

        pooled = torch.cat(
            [self.conv_and_pool(conv, embedded) for conv in self.convs], 1
        )
        pooled = self.dropout(pooled)
        # The pooled features are already non-negative, so this ReLU is a
        # no-op kept only to leave the computation unchanged.
        return self.fc(F.relu(pooled))
LSTM
class LSTMModel(nn.Module):
    """LSTM text classifier that classifies from the last real token's output.

    Args:
        pretrain_embed: Optional pretrained embedding matrix. When given it is
            loaded with ``freeze=False`` (fine-tuned); otherwise a fresh
            ``[len(vocab), config.embed_size]`` table is created.
        vocab: Sized vocabulary. Its last index (``len(vocab) - 1``) is used
            as the padding id.
        config: Object exposing ``embed_size``, ``hidden_size``,
            ``num_layers``, ``dropout`` and ``labels``.
    """

    def __init__(self, pretrain_embed, vocab, config):
        super(LSTMModel, self).__init__()
        self.config = config
        self.vocab = vocab
        if pretrain_embed is not None:
            self.embedding = nn.Embedding.from_pretrained(
                pretrain_embed, padding_idx=len(vocab) - 1, freeze=False
            )
        else:
            self.embedding = nn.Embedding(
                len(vocab), config.embed_size, padding_idx=len(vocab) - 1
            )
        self.dropout = nn.Dropout(config.dropout)
        self.lstm = nn.LSTM(
            # Take the width from the table itself so a pretrained matrix whose
            # dimension differs from config.embed_size still works.
            self.embedding.embedding_dim,
            config.hidden_size,
            config.num_layers,
            batch_first=True,
            # Inter-layer dropout only exists with more than one layer;
            # passing it for a single layer just triggers a warning.
            dropout=config.dropout if config.num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(config.hidden_size, len(config.labels))

    def forward(self, x):
        """Compute class logits.

        Args:
            x: A pair ``(token_ids, _)``. ``token_ids`` has shape
                ``[batch_size, seq_len]``; the second element is ignored.

        Returns:
            Logits of shape ``[batch_size, len(config.labels)]``.
        """
        tokens, _ = x
        embedded = self.dropout(self.embedding(tokens))  # [batch_size, seq_len, embed_size]
        outputs, _ = self.lstm(embedded)                 # [batch_size, seq_len, hidden_size]

        # Read the output at the last non-padding position instead of blindly
        # using the final timestep, which for right-padded batches is the
        # state after consuming padding tokens. For left-padded or unpadded
        # rows this is still the final timestep.
        is_token = tokens != self.embedding.padding_idx
        # The running count strictly increases on real tokens, so after
        # masking its unique maximum sits on the last real token.
        last_idx = (is_token.long().cumsum(dim=1) * is_token).argmax(dim=1)
        # Rows made entirely of padding fall back to the final timestep.
        last_idx = last_idx.masked_fill(~is_token.any(dim=1), tokens.size(1) - 1)

        gather_idx = last_idx.view(-1, 1, 1).expand(-1, 1, outputs.size(2))
        last_output = outputs.gather(1, gather_idx).squeeze(1)  # [batch_size, hidden_size]
        return self.fc(last_output)
transformer + fc
transformer的attention部分如下:
暂无评论