# Tricks to train big models

We will try to select some relatively big model and fine tune it on the custom text dataset with some tricks like checkpointing and offloading

![](https://raw.githubusercontent.com/oseledets/dl2023/main/seminars/seminar-10/neuro_kish.jpg)

## Prerequisites

### Dataset collection

In [None]:
!git clone https://github.com/oseledets/dl2023

In [1]:
MODEL_NAME = 'ai-forever/rugpt3small_based_on_gpt2' #@param ['ai-forever/rugpt3small_based_on_gpt2', 'ai-forever/rugpt3medium_based_on_gpt2','ai-forever/rugpt3large_based_on_gpt2', 'gpt2-large']
DATASET_PATH = './dl2023/seminars/seminar-10/kish_lyrics.txt'  #@param ['./dl2023/seminars/seminar-10/manowar_lyrics.txt','./dl2023/seminars/seminar-10/kish_lyrics.txt','./dl2023/seminars/seminar-10/korzh_lyrics.txt', './dl2023/seminars/seminar-10/oxxxy_lyrics.txt', './dl2023/seminars/seminar-10/pushkin.txt']

with open(DATASET_PATH) as text_file:
    text = text_file.read().splitlines()
    print(text)

['', 'Тёмный, мрачный коридор', 'Я на цыпочках, как вор', 'Пробираюсь, чуть дыша', 'Чтобы не спугнуть', 'Тех, кто спит уже давно', 'Тех, кому не всё равно', 'В чью я комнату тайком', 'Желаю заглянуть', 'Чтобы увидеть...', '', 'Как бессонница в час ночной', 'Меняет, нелюдимая, облик твой', 'Чьих, невольница, ты идей?', 'Зачем тебе охотиться на людей?', '', 'Крестик на моей груди', 'На него ты погляди', 'Что в тебе способен он', 'Резко изменить?', 'Много книжек я читал', 'Много фокусов видал', 'Свою тайну от меня', 'Не пытайся скрыть!', 'Я это видел!', 'Как бессонница в час ночной', 'Меняет, нелюдимая, облик твой', 'Чьих, невольница, ты идей?', 'Зачем тебе охотиться на людей?', '', 'Очень жаль, что ты тогда', 'Мне поверить не смогла', 'В то, что новый твой приятель', 'Не такой, как все!', 'Ты осталась с ним вдвоём', 'Не зная ничего о нём', 'Что для всех опасен он', 'Наплевать тебе!', 'И ты попала!', '', 'К настоящему колдуну', 'Он загубил таких, как ты, не одну!', 'Словно куклой, в час н

### Memory measuring
Ensure, that you are working on GPU (If you are in Colab, go ti Runtime -> Change runtime type)

In [2]:
# Simplest memory profiling
def gpu_mem():
    mem = torch.cuda.mem_get_info()
    mb = list(map(lambda x:x/pow(2,20),mem))
    total = mb[1]
    used = mb[1]-mb[0]
    return used,total

def gpu_mem_info(title = ''):
    used,total = gpu_mem()
    print(f'🤖 {title} gpu mem : {used:.1f}/{total:.1f} mb')

In [3]:
import torch
import pandas as pd
import time

class Profiler():

    def __init__(self,) -> None:
        pass

    def gpu_mem(self):
        mem = torch.cuda.mem_get_info()
        mb = list(map(lambda x:x/pow(2,20),mem))
        total = mb[1]
        used = mb[1]-mb[0]
        return used,total

    def gpu_mem_info(self,title = ''):
        used,total = self.gpu_mem()
        print(f'🤖 {title} gpu mem : {used:.1f}/{total:.1f} mb')

    def one_step_report(self,batch, model, optimizer, do_backward = True,device = torch.device('cpu'),print_loss = False,deepspeed = False):
    
        report_df = pd.DataFrame(columns=['used_mem','delta_mem','delta_time'])

        delta_time =[0]
        used_mem = [self.gpu_mem()[0]]

        self.gpu_mem_info('begin')

        model.train()
        
        ids = batch['input_ids'].to(device,dtype=torch.long)
        labels = batch['labels'].to(device,dtype=torch.long)
        
        torch.cuda.synchronize()
        start_time = time.time()

        outputs = model(input_ids = ids,labels = labels)
        loss = outputs[0]

        torch.cuda.synchronize()
        forward_time = time.time()
        delta_time.append(-start_time + forward_time)

        used_mem.append(self.gpu_mem()[0])
        self.gpu_mem_info(f'{delta_time[-1]:.3f}s forward')
        if do_backward:
            optimizer.zero_grad()
            if deepspeed:
                model.backward(loss)
            else:
                loss.backward()

            torch.cuda.synchronize()
            backward_time = time.time()
            delta_time.append(-forward_time + backward_time)
            used_mem.append( self.gpu_mem()[0])
            self.gpu_mem_info(f'{delta_time[-1]:.3f}s backward')

            if deepspeed:
                model.step()
            else:
                optimizer.step()

            torch.cuda.synchronize()
            optimizer_step_time = time.time()
            delta_time.append(-backward_time + optimizer_step_time)
            used_mem.append( self.gpu_mem()[0])
            self.gpu_mem_info(f'{delta_time[-1]:.3f}s optimizer_step')
        
        if (print_loss):
            print('loss',loss)

        torch.cuda.empty_cache() 
        used_mem.append( self.gpu_mem()[0])
        torch.cuda.synchronize()
        end_time = time.time()
        delta_time.append(end_time - optimizer_step_time)
        # 
        report_df.loc[:,'used_mem'] = pd.Series(used_mem)
        report_df.loc[:,'delta_time'] = pd.Series(delta_time)
        indexes = ['begin','forward','backward','optim_step','end']
        report_df.index = indexes

        report_df['delta_mem'] =  report_df['used_mem']- report_df.loc['begin','used_mem']

        report_df.loc['total'] = [self.gpu_mem()[1],0,end_time-start_time]
        report_df['delta_time'] = report_df['delta_time'].map(lambda t : round(t,3))
        
        return report_df

prof = Profiler()

prof.gpu_mem()
# (5804.0, 15109.75)

prof.gpu_mem_info() 
# gpu mem : 5840.0/15109.8 mb


# report = prof.one_step_report(batch, model,optim,device = DEVICE)
# # begin gpu mem : 5804.0/15109.8 mb
# # 0.050s forward gpu mem : 13006.0/15109.8 mb
# # 1.232s backward gpu mem : 14576.0/15109.8 mb
# # 0.025s optimizer_step gpu mem : 14576.0/15109.8 mb

# report

🤖  gpu mem : 103.0/15101.8 mb


### Imports

In [24]:
!pip install -q sentencepiece
!pip install -q transformers  datasets
!pip install -q accelerate
!pip install -q deepspeed mpi4py
!pip install -q pynvml
!pip install -q wandb

## Model loading

In [4]:
import torch

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import transformers

from sklearn.model_selection import train_test_split

import time
import pandas as pd


import random

import deepspeed

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"🤖 Working on {DEVICE}")

model_name_or_path = MODEL_NAME
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(DEVICE)

🤖 Working on cuda


In [5]:
CHECKPOINTING = False
if CHECKPOINTING:
    model.gradient_checkpointing_enable()
    model.config.use_cache = False

In [6]:
print(f"🤖The total number of parameters in the model is {model.num_parameters()}")
gpu_mem_info()

🤖The total number of parameters in the model is 125231616
🤖  gpu mem : 673.0/15101.8 mb


## Dataset preparation

In [5]:
if 'pushkin' in DATASET_PATH or 'mayakovskiy' in DATASET_PATH:
    tokenizer.add_tokens('</s>')
    tokenizer.add_special_tokens({
        'eos_token': '</s>',
        'pad_token': '<pad>'
    })
if 'lyrics' in DATASET_PATH:
    tokenizer.add_tokens('[EOS]')
    tokenizer.add_special_tokens({
        'eos_token': '[EOS]',
        'pad_token': '<pad>'
    })

model.resize_token_embeddings(len(tokenizer))

# Dataset
train_dataset = TextDataset(tokenizer=tokenizer,file_path=DATASET_PATH,block_size=512)
train_dataset, eval_dataset = train_test_split(train_dataset,test_size = 0.1,random_state = 42)
  
# Creating a data_collator (slices the text into optimal length pieces)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



In [6]:
# Testing
tokenizer.decode(train_dataset[0])

' знак\n\nЖивых, как ни странно, в посёлке нету — знакомый почерк он узнал\nБывал ли я здесь? Ответ на это сам себе маньяк не дал\nЧто с памятью стало? Отказала, чувство снова подсказало\nТут кто-то был, о ком он напрочь позабыл — всё позабыл\n\nНикого вокруг, только сердца стук\nИ во власти рук заточенный сук — его верный друг\nЧто-то здесь не так, будто рядом враг\nКак свинец кулак, подруга луна, подай верный знак\nНо кто-то сзади, потехи ради, по рукоятку вонзил в лопатку\nРжавый тесак — и рухнул замертво маньяк, не обернувшись\nНад телом согнувшись и усмехнувшись\nВоскликнул кто-то лиха работа\nМой ученик, ты плохо ремесло постиг — зашёл в тупик\n\nНикого вокруг, только сердца стук\nИ во власти рук заточенный сук — его верный друг\nЧто-то здесь не так, будто рядом враг\nКак свинец кулак, подруга луна, подай верный знак\nНикого вокруг, только сердца стук\nИ во власти рук заточенный сук — его верный друг\nЧто-то здесь не так, будто рядом враг\nКак свинец кулак, подруга луна, подай ве

## Training

In [7]:
batch_size = 2
n_epochs = 6
training_args = TrainingArguments(
    output_dir="./finetuned", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=n_epochs, # number of training epochs

    per_device_train_batch_size=batch_size, # batch size for training
    per_device_eval_batch_size=batch_size,  # batch size for evaluation
    warmup_steps=10,# number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=4, # to make "virtual" batch size larger
    skip_memory_metrics = False,
    evaluation_strategy  = 'epoch',
    logging_strategy  = 'epoch',
    save_strategy  = 'epoch',
    # auto_find_batch_size = True,
    load_best_model_at_end = True,
    # deepspeed = 'ds_config.json'
    )


In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset = eval_dataset
    
    # optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5),None) # Optimizer and lr scheduler
)

In [9]:
trainer_log = trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mbratishka[0m ([33mskoltech_optimization[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,3.3289,2.945365
2,3.0193,2.843827
3,2.79,2.802157
4,2.6276,2.771532
5,2.5212,2.771832
6,2.4622,2.766451


## Generation

In [13]:
# Probabilistic sample example
text = "Пришёл я ночью в Сколтех"
input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
model.eval()
with torch.no_grad():
    out = model.generate(input_ids, 
                        do_sample=True,
                        num_beams=4,
                        temperature=2.5,
                        top_p=0.9,
                        max_length=200,
                        )

generated_text = list(map(tokenizer.decode, out))[0]
print()
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Пришёл я ночью в Сколтех, я, к примеру,
Смотрел на свою дверь в
Кольце Тьмы, то и дело выглядывал:
А за дверью темно и холодно
Чего-то я там не вижу:
Что за напасть такая?

Что там такое происходит, зачем мне это знать?
Коль скоро дверь открыта -
Я здесь, в доме своём, в одиночестве.
И вот сижу я на полу, смотрю на
Всё вокруг вижу, в том я уверен:
Сверху кто-то смотрит, в том не я
Это значит - на меня
Взглянули мои соседи:
Что-то я им не понравился:
Всюду, кто-то рычит, кто-то стонет,
Что в них, черт возьми, скрывается
И что они могут нам посоветовать
Если, конечно, им это под силу
В моём случае - вам помочь
Но мне это надо, поверьте!



## Memory measurement without checkpointing

In [14]:
CHECKPOINTING = False
if not CHECKPOINTING:
    model.gradient_checkpointing_disable()
    model.config.use_cache = True
b_size = 2
random.shuffle(train_dataset)
batch = data_collator(train_dataset[:b_size])
optim = torch.optim.Adam(model.parameters(),lr=1e-5)

prof = Profiler()
prof.one_step_report(batch, model,optim,device = DEVICE)

🤖 begin gpu mem : 3939.0/15101.8 mb
🤖 0.137s forward gpu mem : 5657.0/15101.8 mb
🤖 0.200s backward gpu mem : 6051.0/15101.8 mb
🤖 0.050s optimizer_step gpu mem : 6051.0/15101.8 mb


  report_df.loc[:,'delta_time'] = pd.Series(delta_time)


Unnamed: 0,used_mem,delta_mem,delta_time
begin,3939.0,0.0,0.0
forward,5657.0,1718.0,0.137
backward,6051.0,2112.0,0.2
optim_step,6051.0,2112.0,0.05
end,4317.0,378.0,0.045
total,15101.8125,0.0,0.432


## Memory measurement with checkpointing

In [15]:
CHECKPOINTING = True
if CHECKPOINTING:
    model.gradient_checkpointing_enable()
    model.config.use_cache = False

In [16]:
b_size = 2
random.shuffle(train_dataset)
batch = data_collator(train_dataset[:b_size])
optim = torch.optim.Adam(model.parameters(),lr=1e-5)

prof = Profiler()
prof.one_step_report(batch, model,optim,device = DEVICE)

🤖 begin gpu mem : 4317.0/15101.8 mb
🤖 0.143s forward gpu mem : 4515.0/15101.8 mb
🤖 0.281s backward gpu mem : 4515.0/15101.8 mb
🤖 0.053s optimizer_step gpu mem : 4735.0/15101.8 mb


  report_df.loc[:,'delta_time'] = pd.Series(delta_time)


Unnamed: 0,used_mem,delta_mem,delta_time
begin,4317.0,0.0,0.0
forward,4515.0,198.0,0.143
backward,4515.0,198.0,0.281
optim_step,4735.0,418.0,0.053
end,4015.0,-302.0,0.015
total,15101.8125,0.0,0.493


In [13]:
transformers.deepspeed.is_deepspeed_zero3_enabled()

False

## Let us add the fp16

In [17]:
batch_size = 2
n_epochs = 6
training_args = TrainingArguments(
    output_dir="./finetuned", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=n_epochs, # number of training epochs

    per_device_train_batch_size=batch_size, # batch size for training
    per_device_eval_batch_size=batch_size,  # batch size for evaluation
    warmup_steps=10,# number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=4, # to make "virtual" batch size larger
    skip_memory_metrics = False,
    evaluation_strategy  = 'epoch',
    logging_strategy  = 'epoch',
    save_strategy  = 'epoch',
    # auto_find_batch_size = True,
    load_best_model_at_end = True,
    # deepspeed = 'ds_config.json'
    fp16=True,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset = eval_dataset
    
    # optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5),None) # Optimizer and lr scheduler
)


In [18]:
CHECKPOINTING = True
if CHECKPOINTING:
    model.gradient_checkpointing_enable()
    model.config.use_cache = False

In [19]:
b_size = 2
random.shuffle(train_dataset)
batch = data_collator(train_dataset[:b_size])
optim = torch.optim.Adam(model.parameters(),lr=1e-5)

prof = Profiler()
prof.one_step_report(batch, model,optim,device = DEVICE)

🤖 begin gpu mem : 3817.0/15101.8 mb
🤖 0.140s forward gpu mem : 4017.0/15101.8 mb
🤖 0.294s backward gpu mem : 4215.0/15101.8 mb
🤖 0.049s optimizer_step gpu mem : 4215.0/15101.8 mb


  report_df.loc[:,'delta_time'] = pd.Series(delta_time)


Unnamed: 0,used_mem,delta_mem,delta_time
begin,3817.0,0.0,0.0
forward,4017.0,200.0,0.14
backward,4215.0,398.0,0.294
optim_step,4215.0,398.0,0.049
end,2819.0,-998.0,0.03
total,15101.8125,0.0,0.514


## DeepSpeed tricks

In [75]:
%%bash

cat <<'EOT' > ds_config.json
{
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_force_ds_cpu_optimizer": false,


    "zero_optimization": {
        "stage": 3,
        "contiguous_gradients": true,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_prefetch_bucket_size": 1e7,
        "stage3_param_persistence_threshold": 1e5,
        "reduce_bucket_size": 1e7,
        "sub_group_size": 1e9,
        "offload_optimizer": {
            "device": "cpu"
            },
        "offload_param": {
            "device": "cpu"
            }
        }
}
EOT

In [55]:
import os

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

In [None]:
# This is broken for the moment

# b_size = 8
# # step_model = engine[0].module
# random.shuffle(train_dataset)
# batch = data_collator(train_dataset[:b_size])

# optim = torch.optim.Adam(model.parameters(),lr=1e-5)


# from transformers.deepspeed import HfDeepSpeedConfig
# from transformers import AutoModel
# import deepspeed

# ds_config = 'ds_config.json'  # deepspeed config object or path to the file
# # must run before instantiating the model to detect zero 3
# dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
# engine = deepspeed.initialize(model=model, optimizer = optim,config_params=ds_config)

In [76]:
batch_size = 2
n_epochs = 6
training_args = TrainingArguments(
    output_dir="./finetuned", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=n_epochs, # number of training epochs

    per_device_train_batch_size=batch_size, # batch size for training
    per_device_eval_batch_size=batch_size,  # batch size for evaluation
    warmup_steps=10,# number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=4, # to make "virtual" batch size larger
    skip_memory_metrics = False,
    evaluation_strategy  = 'epoch',
    logging_strategy  = 'epoch',
    save_strategy  = 'epoch',
    # auto_find_batch_size = True,
    load_best_model_at_end = True,
    deepspeed = 'ds_config.json'
    )


In [77]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset = eval_dataset
    
    # optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5),None) # Optimizer and lr scheduler
)

In [78]:
trainer_log = trainer.train()



Using /root/.cache/torch_extensions/py39_cu118 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py39_cu118/utils...
Emitting ninja build file /root/.cache/torch_extensions/py39_cu118/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module utils...


Time to load utils op: 74.96789717674255 seconds
Parameter Offload: Total persistent parameters: 121344 in 98 params


Using /root/.cache/torch_extensions/py39_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...


Time to load utils op: 0.010263919830322266 seconds


Epoch,Training Loss,Validation Loss


## Useful links

1. [Tokenizers tutorial](https://huggingface.co/docs/transformers/tokenizer_summary) - brief analysis of all types of tokenizers from Huggingface with examples.
1. [How to generate text](https://huggingface.co/blog/how-to-generate) - overview of how to sample text using language models (bimsurch, etc.).
1. [Attention is All You Need](https://arxiv.org/pdf/1706.03762.pdf) - original article about the first Transformer.
1. [GPT-1](https://openai.com/blog/language-unsupervised/) - an article on OpenAI blog about GPT-1.
1. [GPT-2](https://openai.com/blog/better-language-models/) - OpenAI blog article about GPT-2.
1. [GPT-3](https://openai.com/blog/gpt-3-apps/) - OpenAI blog article about GPT-3.
1. [WebGPT](https://openai.com/blog/improving-factual-accuracy/) - OpenAI blog article about GPT-3, trained to google.
1. [Codex](https://openai.com/blog/openai-codex/) - OpenAI blog article about GPT-3 trained to write code.