Advance preparation

  1. Hugging Face BERT documentation: huggingface.co/transformer…
  2. Dataset: www.kaggle.com/c/jigsaw-to…

The dataset has six label columns, and a comment can belong to several of them at once, so this is a multi-label classification problem.

import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

Start processing data…

df = pd.read_csv("train.csv")
df.head()
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['comment_text', 'list']].copy()
new_df.head()
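For reference, the six label columns collapsed into the list column above are the standard Jigsaw ones (a short sketch; the example target vector is only an illustration):

# The six toxicity labels (df.columns[2:] in train.csv); every comment gets an
# independent 0/1 flag per label, which is what makes this a multi-label problem.
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# e.g. a comment that is both toxic and insulting has the target [1, 0, 0, 0, 1, 0]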

Defining hyperparameters

MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Test the tokenizer

test_one_sent = new_df.comment_text[0]
print(tokenizer.encode_plus(test_one_sent, 
                            None, 
                            add_special_tokens=True,
                            max_length=MAX_LEN,
                            pad_to_max_length=True,
                            return_token_type_ids=True))
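To see the actual word pieces rather than raw ids, the ids can be mapped back with convert_ids_to_tokens (a small optional check, not part of the original pipeline):

encoded = tokenizer.encode_plus(test_one_sent,
                                None,
                                add_special_tokens=True,
                                max_length=MAX_LEN,
                                pad_to_max_length=True,
                                return_token_type_ids=True)
# map ids back to word pieces to see the [CLS] ... [SEP] [PAD] ... layout
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'])
print(tokens[:20])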

The Dataset is defined using torch’s Dataset class

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            truncation=True,  # truncate long comments so every sample is exactly max_len tokens
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
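A quick smoke test on a single row helps confirm the tensor shapes before building the loaders (an optional check reusing the objects defined above):

sample = CustomDataset(new_df, tokenizer, MAX_LEN)[0]
# ids/mask/token_type_ids should each be MAX_LEN long; targets is a 6-dim float vector
print(sample['ids'].shape, sample['mask'].shape, sample['targets'])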

Split the data and use torch’s DataLoader to iterate over it in batches

train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

train_dataset = train_dataset[:2000]
test_dataset = test_dataset[:100]
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

Build the network: BERT + Dropout + Linear. The loss function is BCEWithLogitsLoss.

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        # with return_dict=False, BertModel returns (sequence_output, pooled_output);
        # the pooled [CLS] representation feeds the dropout and linear head
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids,
                              return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

The specification of BCEWithLogitsLoss

This class combines sigmoid and BCE Loss as follows:


\ell(x, y) = L = \{l_1, \dots, l_N\}^{T}


l_n = -w_n\left[y_n \cdot \log\sigma(x_n) + (1 - y_n) \cdot \log(1 - \sigma(x_n))\right]

where N is the batch size. Applied to our (batch, 6) outputs, the loss treats each of the six labels as an independent binary classification and averages over them.

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

Sanity-check the loss: compute it with loss_fn on a small example and verify that it equals the hand-computed binary cross-entropy of the sigmoid outputs.

y = torch.tensor([1., 1., 0.])
x = torch.tensor([1., 0.9, 0.1])
xy_loss = loss_fn(x, y)
print(xy_loss)
sigmoid_x = torch.sigmoid(x)   # sigmoid(1.0)=0.7311, sigmoid(0.9)=0.7109, sigmoid(0.1)=0.5250
print(sigmoid_x)
# hand-computed binary cross-entropy, averaged over the three entries
print(-(np.log(0.7311) + np.log(0.7109) + np.log(1 - 0.5250)) / 3)

Defining the optimizer

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

Define the training process

from tqdm import tqdm

def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
for epoch in range(EPOCHS):
    train(epoch)

Define the validation process

def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    # threshold the sigmoid probabilities at 0.3 to get hard 0/1 predictions
    outputs = (np.array(outputs) >= 0.3).astype(int)
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

The final results:

  • Accuracy Score = 0.89
  • F1 Score (Micro) = 0.3870967741935484
  • F1 Score (Macro) = 0.18253968253968253

The scores are poor, as expected: only 2,000 training samples and 100 test samples were used, and the model was trained for a single epoch.
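As a follow-up, here is a minimal sketch of how the fine-tuned model could be applied to a new comment, reusing tokenizer, model, device, and MAX_LEN from above; the predict helper, the label names, and the 0.3 threshold mirror the evaluation code but are otherwise assumptions, not part of the original tutorial:

def predict(comment, threshold=0.3):
    # tokenize one comment exactly as CustomDataset does (hypothetical helper)
    enc = tokenizer.encode_plus(comment,
                                None,
                                add_special_tokens=True,
                                max_length=MAX_LEN,
                                pad_to_max_length=True,
                                truncation=True,
                                return_token_type_ids=True)
    ids = torch.tensor([enc['input_ids']], dtype=torch.long).to(device)
    mask = torch.tensor([enc['attention_mask']], dtype=torch.long).to(device)
    token_type_ids = torch.tensor([enc['token_type_ids']], dtype=torch.long).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(ids, mask, token_type_ids)
    probs = torch.sigmoid(logits).cpu().numpy()[0]

    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    return {label: (float(p), bool(p >= threshold)) for label, p in zip(labels, probs)}

print(predict("You are a wonderful person."))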