Advance preparation
- Hugging Face BERT documentation: huggingface.co/transformer…
- Dataset: www.kaggle.com/c/jigsaw-to…
The dataset has six label columns (toxic, severe_toxic, obscene, threat, insult, identity_hate), and a comment can carry any subset of them, which makes this a multi-label classification problem.
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
Start by processing the data.
df = pd.read_csv("train.csv")
df.head()
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['comment_text', 'list']].copy()
new_df.head()
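To make the target format concrete, the small check below (not in the original post) prints one comment together with its label vector; the 'list' column holds one 0/1 entry per label, e.g. all zeros for a clean comment.
print(new_df['comment_text'].iloc[0][:80])   # first 80 characters of the comment
print(new_df['list'].iloc[0])                # its 6-element 0/1 label vector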
Defining hyperparameters
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Test the tokenizer
test_one_sent = new_df.comment_text[0]
print(tokenizer.encode_plus(test_one_sent,
                            None,
                            add_special_tokens=True,
                            max_length=MAX_LEN,
                            padding='max_length',   # replaces the deprecated pad_to_max_length=True
                            truncation=True,
                            return_token_type_ids=True))
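To see which tokens those ids correspond to, you can map them back with convert_ids_to_tokens (this inspection step is not in the original post); the sequence should start with [CLS], close the sentence with [SEP], and be padded with [PAD] up to MAX_LEN.
enc = tokenizer.encode_plus(test_one_sent, None, add_special_tokens=True,
                            max_length=MAX_LEN, padding='max_length',
                            truncation=True, return_token_type_ids=True)
print(tokenizer.convert_ids_to_tokens(enc['input_ids'])[:10])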
The Dataset is defined using torch’s Dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())   # collapse repeated whitespace
        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',   # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
Split the data, then use torch's DataLoader to define iterable datasets
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Keep only a small subset so the example runs quickly
train_dataset = train_dataset[:2000]
test_dataset = test_dataset[:100]
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)
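As a quick sanity check (not in the original post), pulling a single item out of the training set should give fixed-length tensors: 200 token ids, a 200-element attention mask, and a 6-element target vector.
sample = training_set[0]
print(sample['ids'].shape, sample['mask'].shape, sample['targets'].shape)
# expected: torch.Size([200]) torch.Size([200]) torch.Size([6])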
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0
               }
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
Build the network: BERT + Dropout + Linear. The loss function is BCEWithLogitsLoss.
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)   # 768 = BERT hidden size, 6 = number of labels

    def forward(self, ids, mask, token_type_ids):
        # return_dict=False keeps the tuple output (sequence_output, pooled_output),
        # so the unpacking below also works on newer transformers versions
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids,
                              return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)
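Before training, a throwaway forward pass (my addition, not part of the original post) confirms the wiring: one batch from the training loader should come back as a [TRAIN_BATCH_SIZE, 6] tensor of logits, one per label.
batch = next(iter(training_loader))
with torch.no_grad():
    logits = model(batch['ids'].to(device),
                   batch['mask'].to(device),
                   batch['token_type_ids'].to(device))
print(logits.shape)   # expected: torch.Size([8, 6])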
The specification of BCEWithLogitsLoss
This class combines a sigmoid layer and binary cross-entropy loss in one numerically stable step: for a batch of size N, each of the six labels is treated as its own binary classification, and the per-element loss is -[y·log σ(x) + (1 − y)·log(1 − σ(x))], averaged over all elements. In other words, the multi-label problem is converted into six parallel binary problems.
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)
Test the loss function and check that the manual calculation matches
y = torch.tensor([1., 1., 0.])
x = torch.tensor([1., 0.9, 0.1])
xy_loss = loss_fn(x, y)
print(xy_loss)

sigmoid_x = torch.sigmoid(x)
print(sigmoid_x)   # tensor([0.7311, 0.7109, 0.5250])
print(-(np.log(0.7311) + np.log(0.7109) + np.log(1 - 0.5250)) / 3)
Defining the optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
Define the training process
from tqdm import tqdm
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
for epoch in range(EPOCHS):
    train(epoch)
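If you want to reuse the fine-tuned weights later, one minimal option (not covered in the original post; the filename is just an example) is to save the state dict:
torch.save(model.state_dict(), 'bert_toxic_multilabel.pt')   # example filename
# to restore: model.load_state_dict(torch.load('bert_toxic_multilabel.pt'))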
Define the validation process
def validation(epoch):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    # A label counts as predicted whenever its sigmoid probability is >= 0.3
    outputs = (np.array(outputs) >= 0.3).astype(int)
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
Final results:
- Accuracy Score = 0.89
- F1 Score (Micro) = 0.3870967741935484
- F1 Score (Macro) = 0.18253968253968253
The scores are poor simply because we trained on only a small slice of the data (2,000 samples for a single epoch).