In this article we explain the training part of the code. The first two installments are: Highly Reusable BERT Text Classification Code (1): Data Loading, and Highly Reusable BERT Text Classification Code (2): The Model.
The entire training part of the code lives in trainer.py, which contains a single Trainer class.
Besides the usual train and evaluate methods, the Trainer also provides save_model, load_model, save_results and the other methods needed for running experiments.
__init__
__init__ stores the arguments and datasets, loads the BertConfig with self.config_class.from_pretrained(), loads the BERT-based classification model with self.model_class.from_pretrained(), and sets self.device according to whether a usable GPU is available on the machine.
def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
    self.args = args
    self.train_dataset = train_dataset
    self.dev_dataset = dev_dataset
    self.test_dataset = test_dataset
    self.test_results = None

    self.label_lst = get_labels(args)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later

    self.config_class, self.model_class, _ = MODEL_CLASSES[args.model_type]
    self.config = self.config_class.from_pretrained(args.model_name_or_path, finetuning_task=args.task)
    self.model = self.model_class.from_pretrained(args.model_name_or_path,
                                                  config=self.config,
                                                  args=args,
                                                  label_lst=self.label_lst)

    # GPU or CPU
    self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    self.model.to(self.device)
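MODEL_CLASSES and get_labels are imported from the utility code covered in the earlier installments and are not shown here. As a rough sketch of what they might look like (this is an assumption for illustration; the real definitions live in the repo's utils module, and ClsBERT, data_dir and label.txt are made-up names):

# Hypothetical sketch of the helpers used in __init__ (not the repo's actual code)
import os
from transformers import BertConfig, BertTokenizer
from model import ClsBERT  # assumed name of the classification model from part (2)

MODEL_CLASSES = {
    'bert': (BertConfig, ClsBERT, BertTokenizer),
}

def get_labels(args):
    # one label per line in a task-specific label file (file name is an assumption)
    with open(os.path.join(args.data_dir, args.task, 'label.txt'), 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]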
train
The train method is fairly long. The first step, of course, is to wrap the training set self.train_dataset in a DataLoader with a random sampler:
train_sampler = RandomSampler(self.train_dataset)
train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size)
Depending on the arguments, the number of training epochs and the gradient accumulation step size determine the total number of optimization steps t_total:
if self.args.max_steps > 0:
    t_total = self.args.max_steps
    self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
else:
    t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs
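As a quick sanity check with made-up numbers (not from the article): if the dataloader yields 1,000 batches per epoch, gradients are accumulated over 4 steps, and training runs for 3 epochs, then:

# Illustrative numbers only
batches_per_epoch = 1000
gradient_accumulation_steps = 4
num_train_epochs = 3
t_total = batches_per_epoch // gradient_accumulation_steps * num_train_epochs
print(t_total)  # 750 optimizer updates in total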
Next we set up the learning rates, optimizer, warmup and weight decay. As a quick check, the code first prints the name of every parameter in the model:
for n, p in self.model.named_parameters():
    print(n)

# BERT parameters: set a lower learning rate
optimizer_grouped_parameters = []
bert_params = list(self.model.bert.named_parameters())

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters += [
    {
        'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)],
        'weight_decay': self.args.weight_decay,
        'lr': self.args.learning_rate,
    },
    {
        'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
        'lr': self.args.learning_rate,
    }
]

# Linear (classifier) layer parameters
linear_params = list(self.model.classifier.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters += [
    {
        'params': [p for n, p in linear_params if not any(nd in n for nd in no_decay)],
        'weight_decay': self.args.weight_decay,
        'lr': self.args.linear_learning_rate,
    },
    {
        'params': [p for n, p in linear_params if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
        'lr': self.args.linear_learning_rate,
    }
]

optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total)
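If you want to double-check that the two learning rates really end up on the right parameter groups, a quick illustrative check after building the optimizer (not part of the original trainer.py) is:

# Illustrative check of the optimizer's parameter groups
for i, group in enumerate(optimizer.param_groups):
    n_params = sum(p.numel() for p in group['params'])
    print(f"group {i}: lr={group['lr']}, weight_decay={group['weight_decay']}, #params={n_params}")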
Print basic training information
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d".len(self.train_dataset))
logger.info(" Num Epochs = %d", self.args.num_train_epochs)
logger.info(" Total train batch size = %d", self.args.train_batch_size)
logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
logger.info(" Logging steps = %d", self.args.logging_steps)
logger.info(" Save steps = %d", self.args.save_steps)
Then the training loop starts. Building the inputs and running the forward and backward passes is standard PyTorch training. After optimizer.step() and scheduler.step() update the parameters and the learning rate, the code checks whether a logging step has been reached; if so, it runs evaluate on the dev set and saves the model whenever the kappa score improves. Kappa is the default model-selection metric here, and you can swap in whatever metric suits your task. Early stopping triggers once the score has not improved for wait_patient evaluations.
wait = 0
global_step = 0
tr_loss = 0.0
best_score = 0.0
self.model.zero_grad()

train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
        self.model.train()
        batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'label_ids': batch[3]}
        if self.args.model_type != 'distilbert':
            inputs['token_type_ids'] = batch[2]
        outputs = self.model(**inputs)
        loss = outputs[0]

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        loss.backward()

        tr_loss += loss.item()
        if (step + 1) % self.args.gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            self.model.zero_grad()
            global_step += 1

            if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                results = self.evaluate("dev")
                if best_score < results["kappa"]:
                    wait = 0
                    best_score = results["kappa"]
                    self.save_model()
                else:
                    wait += 1
                    print("early stop {}/{}".format(wait, self.args.wait_patient))
                if wait >= self.args.wait_patient:
                    break

            # if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
            #     self.save_model()

        if 0 < self.args.max_steps < global_step:
            epoch_iterator.close()
            break

    if 0 < self.args.max_steps < global_step:
        train_iterator.close()
        break

return global_step, tr_loss / global_step
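One detail worth keeping in mind: with gradient accumulation, each call to optimizer.step() effectively sees train_batch_size * gradient_accumulation_steps examples. With made-up numbers:

# Illustrative only: effective batch size under gradient accumulation
train_batch_size = 16
gradient_accumulation_steps = 4
effective_batch_size = train_batch_size * gradient_accumulation_steps  # 64 examples per optimizer.step()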
That wraps up the train method.
evaluate
The evaluation code is relatively straightforward, so we won't walk through it in detail.
def evaluate(self, mode):
    if mode == 'test':
        dataset = self.test_dataset
    elif mode == 'dev':
        dataset = self.dev_dataset
    else:
        raise Exception("Only dev and test dataset available")

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation on %s dataset *****", mode)
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", self.args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None

    self.model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'label_ids': batch[3]}
            if self.args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2]
            outputs = self.model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

        # Intent prediction
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['label_ids'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(
                out_label_ids, inputs['label_ids'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    results = {
        "loss": round(eval_loss, 7)
    }

    # Intent result
    preds = np.argmax(preds, axis=1)

    total_result = compute_metrics(preds, out_label_ids)
    results.update(total_result)

    if mode == 'test':
        self.test_results = results
        self.save_results()

    logger.info("***** Eval results *****")
    for key in sorted(results.keys()):
        logger.info(" %s = %s", key, str(results[key]))
    return results
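compute_metrics comes from the utility code and is not shown in this article. Since kappa is used above for model selection, here is a minimal sketch of what it might look like, assuming scikit-learn is available (an assumption for illustration, not the repo's actual implementation):

# Hypothetical sketch of compute_metrics
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score

def compute_metrics(preds, labels):
    return {
        "acc": accuracy_score(labels, preds),
        "kappa": cohen_kappa_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro"),
    }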
save_model
The model weights are saved with save_pretrained, and the training arguments are saved alongside them with torch.save.
def save_model(self):
    # Save model checkpoint (Overwrite)
    if not os.path.exists(self.args.model_dir):
        os.makedirs(self.args.model_dir)
    model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
    model_to_save.save_pretrained(self.args.model_dir)

    # Save training arguments together with the trained model
    torch.save(self.args, os.path.join(self.args.model_dir, 'training_args.bin'))
    logger.info("Saving model checkpoint to %s", self.args.model_dir)
load_model
load_model restores the fine-tuned model from model_dir with from_pretrained:
def load_model(self):
    # Check whether model exists
    if not os.path.exists(self.args.model_dir):
        raise Exception("Model doesn't exists! Train first!")

    try:
        self.model = self.model_class.from_pretrained(self.args.model_dir,
                                                      args=self.args,
                                                      label_lst=self.label_lst)
        self.model.to(self.device)
        logger.info("***** Model Loaded *****")
    except:
        raise Exception("Some model files might be missing...")
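Putting the pieces together, a typical way to drive this class from a main script looks roughly like the sketch below (the do_train / do_eval flags are assumptions for illustration, not necessarily the repo's actual arguments):

# Illustrative usage sketch, not the repo's actual main.py
trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

if args.do_train:
    trainer.train()           # saves the best checkpoint (by dev kappa) to args.model_dir

if args.do_eval:
    trainer.load_model()      # reload the best checkpoint
    trainer.evaluate("test")  # also appends a row to results.csv via save_results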
save_results
pandas is used to write the experimental results, together with the key hyperparameters, into a CSV file:
def save_results(self):
    if not os.path.exists(self.args.results_dir):
        os.makedirs(self.args.results_dir)

    var = [self.args.task, self.args.learning_rate, self.args.num_train_epochs, self.args.max_seq_len, self.args.seed]
    names = ['task', 'lr', 'epoch', 'max_len', 'seed']
    vars_dict = {k: v for k, v in zip(names, var)}
    results = dict(self.test_results, **vars_dict)
    keys = list(results.keys())
    values = list(results.values())

    file_name = 'results.csv'
    results_path = os.path.join(self.args.results_dir, file_name)

    if not os.path.exists(results_path):
        ori = []
        ori.append(values)
        df1 = pd.DataFrame(ori, columns=keys)
        df1.to_csv(results_path, index=False)
    else:
        df1 = pd.read_csv(results_path)
        new = pd.DataFrame(results, index=[1])
        df1 = df1.append(new, ignore_index=True)
        df1.to_csv(results_path, index=False)

    data_diagram = pd.read_csv(results_path)
    print('test_results', data_diagram)
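Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. On a newer pandas, the same append-a-row step can be written with pd.concat, keeping the rest of the method unchanged:

# Equivalent to df1 = df1.append(new, ignore_index=True) on newer pandas
new = pd.DataFrame([results])
df1 = pd.concat([df1, new], ignore_index=True)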
Either way, the resulting results.csv contains one row per run, with the test metrics alongside the task, lr, epoch, max_len and seed columns.
I'm a newcomer to NLP with limited experience; if anything here is wrong or incomplete, corrections and criticism are very welcome!