Understand GPU Memory Usage when Training Language Models¶
Introduction¶
When fine-tuning large machine learning models, it's crucial to understand the details of GPU memory consumption in order to optimize the training process and avoid the frequently encountered "CUDA out of memory" error. The substantial memory demand primarily arises from the need to store model parameters, gradients, and intermediate activations throughout the forward and backward passes of each training step.
The dreaded "CUDA out of memory" error occurs when the GPU's available memory cannot meet the model's demands. Various factors can cause this issue, including training with excessively large batch sizes, employing a model architecture that exceeds the GPU's memory capacity, or neglecting memory optimization techniques such as gradient checkpointing or mixed precision training, which are crucial for managing the extensive memory requirements of large models.
Understanding the dynamics of GPU memory consumption is therefore pivotal for efficient model training. This notebook is designed to shed light on the nuances of GPU memory usage, using the fine-tuning of a transformer-based model, in this case BERT-large, as an example. It aims to explain the causes behind the "CUDA out of memory" error and to indicate when it might occur, providing valuable insights for optimizing model training and preventing memory-related setbacks.
I ran all tests in this notebook on an Nvidia RTX 3090 GPU with 24 GiB of memory. You can run the script yourself to get the same insights for your own GPU.
Preliminary Work¶
We start with all needed package imports:
import matplotlib.patches as patches
import numpy as np
from datasets import Dataset
import torch
from torch.profiler import ProfilerActivity
from transformers import AutoModelForSequenceClassification
import gc
from pynvml import (nvmlInit,
nvmlDeviceGetHandleByIndex,
nvmlDeviceGetMemoryInfo)
from transformers import TrainingArguments
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
import pandas as pd
import matplotlib.pyplot as plt
We first establish a function designed to monitor and report on the GPU's memory usage. This function is critical for understanding how different phases of the training process affect memory allocation and consumption on the GPU. It specifically prints out three key items to provide a comprehensive overview of the current memory landscape:
- GPU Memory Occupied:
This metric reveals the actual amount of memory currently in use on the GPU. It's an essential indicator of how much memory is being utilized by all processes running on the GPU, offering insight into the available capacity for additional computations.
- Reserved GPU Memory by Torch:
This figure represents the memory that has been earmarked by the PyTorch framework for its operations but is not currently occupied. Although this portion of memory is not actively being used, it is set aside exclusively for PyTorch, meaning it cannot be accessed by other tools or applications. This reservation is crucial for PyTorch to efficiently manage its memory needs, ensuring there's a buffer for future data items or computations without the need to constantly request more memory from the GPU.
- Allocated GPU Memory by Torch:
This measurement indicates the amount of GPU memory directly occupied by PyTorch tensors at any given moment. It provides a clear view of how much memory PyTorch is actively using, which is crucial for understanding the memory footprint of your model's tensors and the impact of operations such as forward and backward passes.
It is important to recognize that there are multiple systems for defining units of digital information, which can lead to confusion. Specifically, two main systems exist:
- Units based on powers of 10: For example, 1 kilobyte (kB) equals 1,000 bytes.
- Units based on powers of 2: For example, 1 kibibyte (KiB) equals 1,024 bytes, or 2^10 bytes.
The distinction between these systems is critical, yet confusion often arises. For example, the term "kB" (a power of 10 unit) is frequently used when the intended meaning is actually "KiB" (a power of 2 unit). This discrepancy is evident in various applications, such as the Windows Task Manager and Nvidia's specifications for their GPUs. Example: The GPU I use is the Nvidia RTX 3090, which has 24 GiB of memory, even though Nvidia reports it as having 24 GB. Note: You can verify this by running torch.cuda.mem_get_info()[1], which returns the total GPU memory for a given device.
To maintain consistency and clarity within this script, I have chosen to use the power of 2 notation (e.g., KiB, MiB, GiB). This choice aims to avoid the common confusion and ensure that the memory units are accurately represented. For a more comprehensive discussion on this topic and the prevalent confusion surrounding these units, you can refer to this article on Wikipedia.
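As a small, optional illustration of the difference (my own addition, using the NVML bindings imported above so that no CUDA context is created yet), the same total GPU memory can be expressed in both unit systems:
# Same total GPU memory, expressed in both unit systems (via NVML)
nvmlInit()
total_bytes = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0)).total
print(f"Total GPU memory: {total_bytes / 1000**3:.2f} GB  (powers of 10)")
print(f"Total GPU memory: {total_bytes / 1024**3:.2f} GiB (powers of 2)")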
def print_gpu_utilization():
gc.collect()
torch.cuda.empty_cache()
nvmlInit()
handle = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(handle)
print("############################################")
print("GPU memory statistics")
print("GPU memory occupied: "+f"{info.used/1024**2:.2f} MiB".rjust(23))
print("Reserved GPU memory by torch: " +
f"{torch.cuda.memory_reserved()/1024**2:.2f} MiB".rjust(14))
print("Allocated GPU memory by torch: " +
f"{torch.cuda.memory_allocated()/1024**2:.2f} MiB".rjust(13))
print("############################################\n")
Let's have a look at the GPU memory summary before we start using PyTorch:
print_gpu_utilization()
############################################
GPU memory statistics
GPU memory occupied: 895.04 MiB
Reserved GPU memory by torch: 0.00 MiB
Allocated GPU memory by torch: 0.00 MiB
############################################
We see that some GPU memory is already occupied before PyTorch is used at all. This is due to the GPU drivers and some other background processes. Let's store this default GPU occupation in the variable default_gpu_occupation so that we can later distinguish it from the GPU memory occupied by PyTorch:
nvmlInit()
handle = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(handle)
default_gpu_occupation = info.used
When PyTorch uses the GPU for the first time, an additional ~300 MiB of GPU memory is occupied. This is due to the CUDA context, which contains the driver, CUDA kernels, etc. See here for a more detailed discussion. Let's add this occupied memory to default_gpu_occupation as well:
default_gpu_occupation += 300 * 1024**2
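If you want to check this number for your own setup, here is a minimal sketch of my own (the exact value depends on the driver, GPU and PyTorch version, and the measured difference also includes a small block reserved by PyTorch's caching allocator):
# Rough check of the CUDA context size: compare NVML's "used" memory before and
# after the first CUDA operation, which creates the context
used_before = nvmlDeviceGetMemoryInfo(handle).used
probe = torch.zeros(1, device="cuda")
used_after = nvmlDeviceGetMemoryInfo(handle).used
print(f"GPU memory added by the first CUDA call: "
      f"{(used_after - used_before) / 1024**2:.2f} MiB")
del probe
torch.cuda.empty_cache()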
Theoretical considerations¶
Next, we will estimate the total GPU memory consumption when fine-tuning a BERT-large model. In general, there are three main drivers that occupy memory during fine-tuning:
- The model weights
- The feed-forward tensors, which are needed for future gradient calculations
- The gradients of the model weights
We will estimate the GPU memory of all these three components.
However, we start by defining the two main parameters that control the final GPU memory consumption, namely the batch size and the maximum sequence length. Additionally, we load the BERT model:
# Define main parameters for this script
batch_size = 4
seq_len = 512
mdl = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased")
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Estimating GPU Memory Usage for Model Weights¶
To accurately assess the amount of GPU memory utilized by the BERT-large model's weight tensors, it's essential to begin by examining the floating-point data type used for the weights. This examination will clarify the number of bytes required to store a single numerical value within the model's architecture:
# %% Inspect dtype of BERT
dtype = mdl.bert.embeddings.word_embeddings.weight.dtype
print(f"\nDtype of bert weights: {dtype}")
element_size = mdl.bert.embeddings.word_embeddings.weight.element_size()
print(f"Number of bytes of single bert weight: {element_size}")
# Test if every model weight is of the same data type
for par in mdl.parameters():
assert par.dtype == torch.float32
Dtype of bert weights: torch.float32
Number of bytes of single bert weight: 4
We see that all weights of the BERT model are of type torch.float32, which occupies 4 bytes per value. To estimate the total expected memory usage of the model weights, we therefore just have to multiply the total number of parameters by 4 bytes:
# %% Get expected size of BERT weights
expected_memory_model_weights = 0
for par in mdl.parameters():
expected_memory_model_weights += element_size * par.numel()
expected_memory_model_weights = expected_memory_model_weights
print(
f"Expected GPU usage by storing model weigths: "
f"{expected_memory_model_weights / 1024**2:.2f} MiB")
Expected GPU usage by storing model weights: 1278.47 MiB
When we load the model weights onto the GPU, we therefore expect an additional allocation of approximately 1278.47 MiB. This increase in memory usage is relatively small, especially when compared to the GPU's total capacity of 24 GiB.
However, there's another aspect we have to consider: large PyTorch tensors can consume more GPU memory than initially anticipated (for reference, see this discussion).
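The effect is easy to reproduce with a toy tensor (my own sketch, not part of the original analysis). PyTorch's caching allocator rounds requests up, currently to multiples of 512 bytes for small tensors and to coarser 2 MiB blocks for very large ones, so the allocated memory can exceed the theoretical tensor size:
# Toy example: request 400 bytes (100 float32 values) and see what the allocator reports
before = torch.cuda.memory_allocated()
tiny = torch.zeros(100, dtype=torch.float32, device="cuda")
print(f"Requested: {tiny.numel() * tiny.element_size()} bytes, "
      f"allocated: {torch.cuda.memory_allocated() - before} bytes")
del tiny
torch.cuda.empty_cache()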
This factor only becomes relevant for the BERT model when considering its embedding weight matrix, which has dimensions of 30522 x 1024. Given this significant size, it's crucial to examine the memory implications of this tensor more closely. First, we calculate the theoretical size as before:
word_embeddings_shape = mdl.bert.embeddings.word_embeddings.weight.shape
expected_size_word_embeddings = word_embeddings_shape[0] * \
word_embeddings_shape[1] * element_size
Next, we check how much memory it actually consumes and compare it to the expected memory:
print(
f"Memory allocated to GPU before adding embedding weights: "
f"{torch.cuda.memory_allocated() / 1024**2:.2f} MiB")
tmp = mdl.bert.embeddings.word_embeddings.weight.to("cuda")
allocated_memory_act = torch.cuda.memory_allocated()
print(
f"Memory allocated to GPU after adding embedding weights: "
f"{torch.cuda.memory_allocated() / 1024**2:.2f} MiB")
print(
f"Expected memory allocated to GPU after adding embedding weights: "
f"{expected_size_word_embeddings / 1024**2:.2f} MiB")
Memory allocated to GPU before adding embedding weights: 0.00 MiB
Memory allocated to GPU after adding embedding weights: 120.00 MiB
Expected memory allocated to GPU after adding embedding weights: 119.23 MiB
We note a discrepancy of roughly 0.77 MiB. We'll incorporate this difference into the expected_memory_model_weights variable.
We'll then unload these weights from the GPU again, so that the following analysis remains unaffected by this step.
additional_memory_word_embeddings = allocated_memory_act - \
expected_size_word_embeddings
expected_memory_model_weights += additional_memory_word_embeddings
del tmp
gc.collect()
torch.cuda.empty_cache()
Now we are done with the estimation of the GPU memory usage of the model weights.
Assessing GPU Memory Requirements for Feedforward Tensors¶
In the process of fine-tuning a BERT model, GPU memory is needed not only for the model weights but also for the feedforward tensors. These tensors are kept because they are required later for computing the gradients. It will become evident that the GPU memory consumption attributed to the feedforward tensors significantly exceeds that of the model weights, and that it is driven by factors such as the batch size and the maximum sequence length used during training.
To methodically evaluate this aspect, our initial step involves creating a function designed to estimate the number of elements for all feedforward tensors. This estimation will take into account the batch size, sequence length and the specific BERT model variant in use.
This is probably the most complicated function in this script, as many different layers are involved when running a BERT model. I'll first introduce the different parts of that function before providing it. Note that I only focus on the feedforward tensors that are relevant for the gradient calculation.
Initially, it's imperative to gather all the essential hyperparameters associated with the BERT model, including its hidden layer sizes, intermediate sizes, and the number of hidden layers and attention heads:
hidden_size = mdl.config.hidden_size
intermediate_size = mdl.config.intermediate_size
num_hidden_layers = mdl.config.num_hidden_layers
num_attention_heads = mdl.config.num_attention_heads
print(f"Hidden size: {hidden_size}")
print(f"Intermediate size: {intermediate_size}")
print(f"# hidden layers: {num_hidden_layers}")
print(f"# attention heads: {num_attention_heads}")
Hidden size: 1024
Intermediate size: 4096
# hidden layers: 24
# attention heads: 16
Our analysis confirms that we are indeed working with a BERT-large model, characterized by its 24 hidden (or encoder) layers, 16 attention heads, and sizes of 1024 for hidden layers and 4096 for intermediate layers, respectively.
A BERT model requires three primary inputs: token IDs, token type IDs and attention masks, leading to a total input element count of 3 * batch_size * seq_len. This constitutes the initial set of feedforward tensors to be allocated in GPU memory. Note that we provide only the token IDs in this script; the remaining inputs are initialized automatically with default values.
- Embedding
The first operation following input reception maps the token IDs to their corresponding token embeddings (batch_size * seq_len * hidden_size). Subsequently, layer normalization is applied to these embeddings. It is important to keep not only the normalized embeddings tensor (batch_size * seq_len * hidden_size) but also the updated layer normalization parameters, shift (batch_size * seq_len) and bias (batch_size * seq_len), in GPU memory, as they are needed for the computation of gradients later in the process. Lastly, a dropout operation is executed on the embeddings, so its output (batch_size * seq_len * hidden_size) has to be stored on the GPU as well.
- Encoder Layers
The main block of the BERT model is its 24 encoder layers. Each encoder layer can be subdivided into two blocks: a Multi-Head Attention block and a Feed Forward block.
i. Multi-Head Attention
The Multi-Head Attention block starts by projecting the input into Query, Key, and Value matrices, producing tensors with dimensions of batch_size * seq_len * hidden_size each. The next step computes the self-attention scores across all attention heads by calculating the pairwise dot products between all possible query and key pairings, resulting in a tensor of dimensions num_attention_heads * batch_size * seq_len * seq_len. Softmax and dropout operations are then applied to these scores, maintaining the same dimensions. The output of the softmax is used to create context-aware embeddings by averaging the value vectors, weighted by the softmax-derived similarities (batch_size * seq_len * hidden_size). These contextual embeddings are subsequently passed through a dense layer, again of size batch_size * seq_len * hidden_size. The block concludes with layer normalization and dropout, with the layer normalization parameters occupying 2 * batch_size * seq_len elements and the dropout output again batch_size * seq_len * hidden_size.
ii. Feed Forward
The Feed Forward block is structured around two linear layers. The first layer expands the embeddings from hidden_size to intermediate_size and then applies a GELU activation (batch_size * seq_len * intermediate_size each). The second layer projects these expanded embeddings back down to hidden_size, resulting in dimensions of batch_size * seq_len * hidden_size. Finally, following the same logic as before, layer normalization and dropout are applied (2 * batch_size * seq_len * hidden_size and 2 * batch_size * seq_len).
- Pooler
The BERT pooler prepares the sequence embedding for downstream tasks like classification by using the final hidden state of the sequence's first token ([CLS]) as a representation of the whole sequence. The pooler applies a dense layer to this final hidden state and then passes it through a tanh activation (both batch_size * hidden_size).
- Classifier
In a final step, BERT applies a classifier, which is simply another feedforward layer with output dimension 1 (we assume a binary problem here). Before that, dropout is applied. The corresponding element counts are batch_size * hidden_size and batch_size. As we also store the targets on the GPU, another array of size batch_size is added.
Putting all these steps together, we get the following function that returns the number of elements for all feedforward tensors relevant for gradient calculation:
def getNumElementsPerFfwTensor(batch_size, seq_len, model):
# Get model configurations
hidden_size = model.config.hidden_size
intermediate_size = model.config.intermediate_size
num_hidden_layers = model.config.num_hidden_layers
num_attention_heads = model.config.num_attention_heads
# Store number of elements of all tensors during forward pass
numel_ffw_tensors = {}
# Input
numel_ffw_tensors["input"] = 3 * batch_size * seq_len
# Embedding
numel_ffw_tensors["embeddings_add"] = batch_size * seq_len * hidden_size
numel_ffw_tensors["embeddings_layer_norm_bias"] = batch_size * seq_len
numel_ffw_tensors["embeddings_layer_norm_shift"] = batch_size * seq_len
numel_ffw_tensors["embeddings_layer_norm"] = batch_size * \
seq_len * hidden_size
numel_ffw_tensors["embeddings_dropout"] = batch_size * \
seq_len * hidden_size
# Encoder Layers
for layer_idx in range(num_hidden_layers):
numel_ffw_tensors[f"layer_{layer_idx}_attention_self_query_dense"] = \
batch_size * seq_len * hidden_size
numel_ffw_tensors[f"layer_{layer_idx}_attention_self_key_dense"] = \
batch_size * seq_len * hidden_size
numel_ffw_tensors[f"layer_{layer_idx}_attention_self_value_dense"] = \
batch_size * seq_len * hidden_size
numel_ffw_tensors[f"layer_{layer_idx}_attention_self_scores"] = \
num_attention_heads * batch_size * seq_len * seq_len
numel_ffw_tensors[f"layer_{layer_idx}_attention_self_probs"] = \
num_attention_heads * batch_size * seq_len * seq_len
numel_ffw_tensors[
f"layer_{layer_idx}_attention_self_probs_dropout"] = \
num_attention_heads * batch_size * seq_len * seq_len
numel_ffw_tensors[f"layer_{layer_idx}_attention_self_context"] = \
batch_size * seq_len * hidden_size
numel_ffw_tensors[f"layer_{layer_idx}_attention_output_dense"] = \
batch_size * seq_len * hidden_size
numel_ffw_tensors[
f"layer_{layer_idx}_attention_output_layer_norm_bias"] = \
batch_size * seq_len
numel_ffw_tensors[
f"layer_{layer_idx}_attention_output_layer_norm_shift"] = \
batch_size * seq_len
numel_ffw_tensors[f"layer_{layer_idx}_attention_output_layer_norm"] = \
batch_size * seq_len * hidden_size
numel_ffw_tensors[f"layer_{layer_idx}_attention_output_dropout"] = \
batch_size * seq_len * hidden_size
numel_ffw_tensors[f"layer_{layer_idx}_intermediate_dense"] = \
batch_size * seq_len * intermediate_size
numel_ffw_tensors[f"layer_{layer_idx}_intermediate_gelu"] = \
batch_size * seq_len * intermediate_size
numel_ffw_tensors[f"layer_{layer_idx}_output_dense"] = \
batch_size * seq_len * hidden_size
numel_ffw_tensors[f"layer_{layer_idx}_output_layer_norm_bias"] = \
batch_size * seq_len
numel_ffw_tensors[f"layer_{layer_idx}_output_layer_norm_shift"] = \
batch_size * seq_len
numel_ffw_tensors[f"layer_{layer_idx}_output_layer_norm"] = \
batch_size * seq_len * hidden_size
numel_ffw_tensors[f"layer_{layer_idx}_output_dropout"] = \
batch_size * seq_len * hidden_size
# Pooler
numel_ffw_tensors["pooler_dense"] = batch_size * hidden_size
numel_ffw_tensors["pooler_tanh"] = batch_size * hidden_size
# Classifier
numel_ffw_tensors["dropout"] = batch_size * hidden_size
numel_ffw_tensors["target"] = batch_size
numel_ffw_tensors["classifier"] = batch_size
return numel_ffw_tensors
Next, we develop a function that calculates the size of each feedforward tensor by multiplying the number of elements by the element size. For most elements, the size is 4 bytes. However, dropout tensors are an exception; they use boolean arrays to indicate the positions of dropped values, requiring only one byte per element, as discussed in this PyTorch forum thread. Our function accounts for these specifics in its computation.
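To see why one byte per element is the right count for dropout masks, here is a small check of my own (not part of the original analysis) comparing the allocated memory of a float32 tensor and a torch.bool tensor with the same number of elements:
# Minimal comparison: float32 vs. bool tensor of 2**20 elements
n = 1024 * 1024
before = torch.cuda.memory_allocated()
float_tensor = torch.zeros(n, dtype=torch.float32, device="cuda")
print(f"float32: {(torch.cuda.memory_allocated() - before) / 1024**2:.2f} MiB")
before = torch.cuda.memory_allocated()
bool_mask = torch.zeros(n, dtype=torch.bool, device="cuda")
print(f"bool:    {(torch.cuda.memory_allocated() - before) / 1024**2:.2f} MiB")
del float_tensor, bool_mask
torch.cuda.empty_cache()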
def getExpectedMemoryPerFfwTensor(batch_size, seq_len, model, element_size):
# Get size per feed forward tensor
numel_ffw_tensors = getNumElementsPerFfwTensor(batch_size, seq_len, model)
expected_memory_per_ffw_tensor = {}
for ffw_tensor in numel_ffw_tensors:
if "dropout" in ffw_tensor:
expected_memory_per_ffw_tensor[ffw_tensor] = \
numel_ffw_tensors[ffw_tensor]
else:
expected_memory_per_ffw_tensor[ffw_tensor] = \
numel_ffw_tensors[ffw_tensor] * element_size
return expected_memory_per_ffw_tensor
Lastly, we introduce a function designed to estimate the total GPU memory utilization by feedforward tensors, by combining the two functions we just defined:
def getExpectedTotalMemoryFfwTensors(batch_size, seq_len, model, element_size):
expected_memory_per_ffw_tensor = getExpectedMemoryPerFfwTensor(
batch_size, seq_len, model, element_size)
# Calculate memory
expected_total_memory_ffw_tensors = sum(
list(expected_memory_per_ffw_tensor.values()))
return expected_total_memory_ffw_tensors
Equipped with the necessary tools and methodologies, we're now prepared to precisely assess the fluctuations in GPU utilization—be it an increase or decrease—throughout each phase of fine-tuning the BERT-large model.
It's important to note that our setup includes the use of an ADAM optimizer. This optimizer not only keeps track of the gradients (same dimensions as model weights) but also records their first and second-order moments (same dimensions as model weights per moment), contributing to additional memory usage.
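If you want to convince yourself of this, here is a minimal CPU-only sketch of my own with a small linear layer: after the first optimizer step, AdamW stores an exp_avg and an exp_avg_sq buffer per parameter, which together occupy twice the memory of the parameters themselves.
# Minimal check: AdamW keeps two moment buffers per parameter
layer = torch.nn.Linear(10, 10)
opt = AdamW(layer.parameters(), lr=1e-5)
layer(torch.randn(2, 10)).sum().backward()
opt.step()
moment_bytes = sum(
    buf.numel() * buf.element_size()
    for state in opt.state.values()
    for name, buf in state.items()
    if name in ("exp_avg", "exp_avg_sq"))
param_bytes = sum(p.numel() * p.element_size() for p in layer.parameters())
print(f"Moment buffers: {moment_bytes} bytes, parameters: {param_bytes} bytes")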
Furthermore, we initialize the cuBLAS workspace, which is essential for efficient computation but also impacts memory allocation (for more details, see this discussion).
print(
f"Expected GPU usage by storing model weigths: "
f"{expected_memory_model_weights / 1024**2:.2f} MiB")
# Gradient memory
expected_memory_gradients = expected_memory_model_weights # Gradients
print(
f"Expected GPU usage by storing gradients of model weights: "
f"{expected_memory_gradients / 1024**2:.2f} MiB")
# Optimizer memory
expected_memory_gradient_moments = 2 * \
expected_memory_model_weights # Both Adam Moments
print(
f"Expected GPU usage by storing ADAM moments: "
f"{expected_memory_gradient_moments / 1024**2:.2f} MiB")
# Forward pass memory
expected_total_memory_ffw_tensors = getExpectedTotalMemoryFfwTensors(
batch_size, seq_len, mdl, element_size)
print(
f"Expected GPU usage by storing forward pass tensors: "
f"{expected_total_memory_ffw_tensors / 1024**2:.2f} MiB")
# cuBLAS workspace size
cublas_workspace_size = 4096 * 1024 * 2 + 16 * 1024 * 8
print(
f"Expected GPU usage for cuBLAS workspace: "
f"{cublas_workspace_size / 1024**2:.2f} MiB")
Expected GPU usage by storing model weights: 1279.25 MiB
Expected GPU usage by storing gradients of model weights: 1279.25 MiB
Expected GPU usage by storing ADAM moments: 2558.49 MiB
Expected GPU usage by storing forward pass tensors: 6642.82 MiB
Expected GPU usage for cuBLAS workspace: 8.12 MiB
Our analysis reveals that, given the current hyperparameter settings, the feedforward tensors theoretically account for over 6 GiB of memory usage, representing the majority of the memory consumption.
Next, we will validate these theoretical insights by comparing them with the actual memory usage observed during the fine-tuning process.
Practical implementation¶
We define a lengthy yet standard function designed to facilitate the training of the BERT model for a given number of training steps (num_iterations). The function offers the flexibility to adjust critical training parameters, including batch_size, gradient_checkpointing, floating-point precision (fp16) and gradient_accumulation_steps.
Additionally, it carefully logs changes in GPU memory at every significant stage of the fine-tuning process, allowing for a detailed comparison with our theoretical expectations.
While the function is extensive, it adheres to conventional training practices. The main components are:
- Load the BERT model
- Create some dummy dataset
- Initialize a torch profiler (a profiler API that is useful for determining the most expensive operators in the model, see here; great visualizations are available with TensorBoard)
- Initialize important objects such as the DataLoader and the optimizer (AdamW)
- The actual training loop, consisting of the forward pass, the backward pass for gradient calculation and the optimizer step, where the weights are updated
Please note that the first iteration differs from subsequent ones for two key reasons:
- Initially, the cuBLAS workspace undergoes initialization twice, as detailed here: once for the forward pass and again for the backward pass. After this initial setup, these workspaces remain in GPU memory and do not require reinitialization in later iterations.
- The ADAM optimizer's moment tensors are allocated in GPU memory during the first backward pass. Once initialized, these tensors do not require reinitialization in future iterations.
def train_bert(model_name,
batch_size,
seq_len,
gradient_checkpointing=False,
fp16=False,
gradient_accumulation_steps=1,
num_iterations=2):
# Load model
model = AutoModelForSequenceClassification.from_pretrained(
"bert-large-uncased")
# Create arbitrary train data
dataset_size = 512
dummy_data = {
"input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
"labels": np.random.randint(0, 1, (dataset_size)),
}
ds = Dataset.from_dict(dummy_data)
ds.set_format("pt")
with torch.profiler.profile(
on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/bert'),
record_shapes=True,
profile_memory=True,
with_stack=True,
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
default_args = {
"output_dir": "tmp",
"evaluation_strategy": "steps",
"num_train_epochs": 1,
"log_level": "error",
"report_to": "none",
}
training_args = TrainingArguments(
per_device_train_batch_size=batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
gradient_checkpointing=gradient_checkpointing,
fp16=fp16,
**default_args,
)
dataloader = DataLoader(ds,
batch_size=batch_size)
if training_args.gradient_checkpointing:
model.gradient_checkpointing_enable()
# Define optimizer to AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)
print("Put model on GPU")
print(
f"Expected GPU usage by storing model weigths: "
f"{expected_memory_model_weights / 1024**2:.2f} MiB")
actual_occupied_memory_by_tensors = torch.cuda.memory_allocated()
model.to("cuda")
mem_act = torch.cuda.memory_allocated() - \
actual_occupied_memory_by_tensors
print(f"Actual GPU usage by storing model weigths: "
f"{mem_act / 1024**2:.2f} MiB")
actual_occupied_memory_by_tensors = torch.cuda.memory_allocated()
print_gpu_utilization()
        # Enable train mode of model
model.train()
for step, batch in enumerate(dataloader, start=1):
# Need to call this at each step to
# notify profiler of steps' boundary.
prof.step()
if step > num_iterations:
break
# Get batch
batch = {k: batch[k].to("cuda") for k in batch}
# Forward pass
if step == 1:
exp_net_additional_gpu_usage = \
expected_total_memory_ffw_tensors + cublas_workspace_size
print(f"Forward pass iteration {step}")
print("Expected additional GPU usage "
"by storing feedforward tensors: "
f"{expected_total_memory_ffw_tensors / 1024**2:.2f} MiB")
print(f"Expected additional GPU usage for cuBLAS workspace: "
f"{cublas_workspace_size / 1024**2:.2f} MiB")
print(f"Expected net additional GPU usage: "
f"{exp_net_additional_gpu_usage / 1024**2:.2f} MiB")
else:
print(f"Forward pass iteration {step}")
print(f"Expected net additional GPU usage: "
f"{expected_total_memory_ffw_tensors / 1024**2:.2f} MiB")
loss = model(**batch).loss
loss = loss / training_args.gradient_accumulation_steps
act_net_additional_gpu_usage = \
torch.cuda.memory_allocated() - \
actual_occupied_memory_by_tensors
print(f"Actual net additional GPU usage: "
f"{act_net_additional_gpu_usage / 1024**2:.2f} MiB")
actual_occupied_memory_by_tensors = torch.cuda.memory_allocated()
print_gpu_utilization()
# Backward pass
if step == 1:
exp_net_diff_gpu_mem = \
expected_memory_gradients + cublas_workspace_size - \
expected_total_memory_ffw_tensors
print(f"Backward pass iteration {step}")
print("Expected released GPU memory by deleting ffw tensors: "
f"{expected_total_memory_ffw_tensors / 1024**2:.2f} MiB")
print("Expected additional GPU usage by "
"adding weight gradients: "
f"{expected_memory_gradients / 1024**2:.2f} MiB")
print(f"Expected additional GPU usage for cuBLAS workspace: "
f"{cublas_workspace_size / 1024**2:.2f} MiB")
print(f"Expected net difference in GPU memory: "
f"{exp_net_diff_gpu_mem / 1024**2:.2f} MiB")
else:
exp_net_diff_gpu_mem = \
expected_memory_gradients - \
expected_total_memory_ffw_tensors
print(f"Backward pass iteration {step}")
print("Expected released GPU memory by deleting ffw tensors: "
f"{expected_total_memory_ffw_tensors / 1024**2:.2f} MiB")
print("Expected additional GPU usage "
"by adding weight gradients: "
f"{expected_memory_gradients / 1024**2:.2f} MiB")
print(f"Expected net difference in GPU memory: "
f"{exp_net_diff_gpu_mem / 1024**2:.2f} MiB")
loss.backward()
act_net_diff_gpu_mem = \
torch.cuda.memory_allocated() - \
actual_occupied_memory_by_tensors
print(f"Actual net difference in GPU memory: "
f"{act_net_diff_gpu_mem / 1024**2:.2f} MiB")
actual_occupied_memory_by_tensors = torch.cuda.memory_allocated()
print_gpu_utilization()
# Optimizer step
if step % training_args.gradient_accumulation_steps == 0:
print(f"Optimizer step iteration {step}")
print("When called the first time, Adam's first and "
"second moments of gradients "
"are initialized and added to GPU memory.")
if step == 1:
print("Expected additional GPU usage by Adam moments: "
f"{expected_memory_gradient_moments / 1024**2:.2f} "
"MiB")
else:
print("Expected additional GPU "
"usage by Adam moments: 0 MiB")
optimizer.step()
act_add_gpu_mem_adam = \
torch.cuda.memory_allocated() - \
actual_occupied_memory_by_tensors
print(f"Actual additional GPU usage by Adam moments: "
f"{act_add_gpu_mem_adam / 1024**2:.2f} MiB")
actual_occupied_memory_by_tensors = \
torch.cuda.memory_allocated()
print_gpu_utilization()
# Set gradients to zero
print(f"Zero grad iteration {step}")
print(f"Expected released GPU memory by deleting gradients: "
f"{expected_memory_gradients / 1024**2:.2f} MiB")
optimizer.zero_grad()
act_rel_gpu_mem_grad = \
actual_occupied_memory_by_tensors - \
torch.cuda.memory_allocated()
print(f"Actual released GPU memory by deleting gradients: "
f"{act_rel_gpu_mem_grad / 1024**2:.2f} MiB")
actual_occupied_memory_by_tensors = \
torch.cuda.memory_allocated()
print_gpu_utilization()
Let's execute this function for two iterations. Beyond this point, further iterations would not provide additional insights as the GPU memory usage becomes repetitive.
print("GPU utilization before training starts:")
print_gpu_utilization()
train_bert("bert-large-uncased", batch_size, seq_len)
# %% Write function to get temporary occupation
# of GPU memory for gradient calculations
GPU utilization before training starts:
############################################
GPU memory statistics
GPU memory occupied: 1628.83 MiB
Reserved GPU memory by torch: 0.00 MiB
Allocated GPU memory by torch: 0.00 MiB
############################################
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Put model on GPU
Expected GPU usage by storing model weights: 1279.25 MiB
Actual GPU usage by storing model weights: 1279.25 MiB
############################################
GPU memory statistics
GPU memory occupied: 2923.20 MiB
Reserved GPU memory by torch: 1290.00 MiB
Allocated GPU memory by torch: 1279.25 MiB
############################################

Forward pass iteration 1
Expected additional GPU usage by storing feedforward tensors: 6642.82 MiB
Expected additional GPU usage for cuBLAS workspace: 8.12 MiB
Expected net additional GPU usage: 6650.95 MiB
Actual net additional GPU usage: 6650.94 MiB
############################################
GPU memory statistics
GPU memory occupied: 10244.72 MiB
Reserved GPU memory by torch: 8244.00 MiB
Allocated GPU memory by torch: 7930.20 MiB
############################################

Backward pass iteration 1
Expected released GPU memory by deleting ffw tensors: 6642.82 MiB
Expected additional GPU usage by adding weight gradients: 1279.25 MiB
Expected additional GPU usage for cuBLAS workspace: 8.12 MiB
Expected net difference in GPU memory: -5355.45 MiB
Actual net difference in GPU memory: -5355.43 MiB
############################################
GPU memory statistics
GPU memory occupied: 4980.72 MiB
Reserved GPU memory by torch: 2972.00 MiB
Allocated GPU memory by torch: 2574.77 MiB
############################################

Optimizer step iteration 1
When called the first time, Adam's first and second moments of gradients are initialized and added to GPU memory.
Expected additional GPU usage by Adam moments: 2558.49 MiB
Actual additional GPU usage by Adam moments: 2558.49 MiB
############################################
GPU memory statistics
GPU memory occupied: 7162.72 MiB
Reserved GPU memory by torch: 5154.00 MiB
Allocated GPU memory by torch: 5133.26 MiB
############################################

Zero grad iteration 1
Expected released GPU memory by deleting gradients: 1279.25 MiB
Actual released GPU memory by deleting gradients: 1279.25 MiB
############################################
GPU memory statistics
GPU memory occupied: 6002.72 MiB
Reserved GPU memory by torch: 3994.00 MiB
Allocated GPU memory by torch: 3854.01 MiB
############################################

Forward pass iteration 2
Expected net additional GPU usage: 6642.82 MiB
Actual net additional GPU usage: 6642.80 MiB
############################################
GPU memory statistics
GPU memory occupied: 12934.78 MiB
Reserved GPU memory by torch: 10926.00 MiB
Allocated GPU memory by torch: 10496.82 MiB
############################################

Backward pass iteration 2
Expected released GPU memory by deleting ffw tensors: 6642.82 MiB
Expected additional GPU usage by adding weight gradients: 1279.25 MiB
Expected net difference in GPU memory: -5363.58 MiB
Actual net difference in GPU memory: -5363.56 MiB
############################################
GPU memory statistics
GPU memory occupied: 7578.78 MiB
Reserved GPU memory by torch: 5570.00 MiB
Allocated GPU memory by torch: 5133.26 MiB
############################################

Optimizer step iteration 2
When called the first time, Adam's first and second moments of gradients are initialized and added to GPU memory.
Expected additional GPU usage by Adam moments: 0 MiB
Actual additional GPU usage by Adam moments: 0.00 MiB
############################################
GPU memory statistics
GPU memory occupied: 7578.78 MiB
Reserved GPU memory by torch: 5570.00 MiB
Allocated GPU memory by torch: 5133.26 MiB
############################################

Zero grad iteration 2
Expected released GPU memory by deleting gradients: 1279.25 MiB
Actual released GPU memory by deleting gradients: 1279.25 MiB
############################################
GPU memory statistics
GPU memory occupied: 6002.78 MiB
Reserved GPU memory by torch: 3994.00 MiB
Allocated GPU memory by torch: 3854.01 MiB
############################################
There we have it! We've successfully predicted the GPU consumption at every step with very good accuracy. The deviations encountered are minimal, less than 10 KiB in most instances.
Predicting Maximum GPU Memory Usage¶
Identifying the peak GPU memory usage during fine-tuning is crucial for avoiding the dreaded CUDA out of memory error. To tackle this, we aim to predict the moment of highest memory occupation, which helps in understanding and preventing potential memory overflow incidents.
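As a practical aside (my addition, not part of the original analysis): when you do run into this error, torch.cuda.memory_summary() prints a compact report of the current and peak statistics of PyTorch's caching allocator, which makes it easier to see which of the components discussed here is responsible:
# Human-readable report of current and peak allocator statistics
print(torch.cuda.memory_summary(device=0, abbreviated=True))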
Intuitively, one might assume the memory peak occurs after the second iteration's forward pass, considering the accumulation of various tensors in the GPU memory at this point in time, including:
- Model weights
- Adam optimizer moments
- Feedforward tensors
- Double allocation of cuBLAS workspace
Indeed, our observations confirm that the GPU's memory reaches its maximum following the forward pass of the second iteration.
However, the absolute peak of GPU memory usage actually occurs during the backpropagation through the last intermediate layer. At this critical point, only a few of the smaller feedforward tensors have been cleared, while additional gradients and temporary tensors further strain the memory resources. The following function calculates that difference (don't bother too much about the actual implementation; I figured it out using the profiler):
def getTempGradientCalculationMemory(batch_size,
seq_len,
model,
element_size):
intermediate_size = model.config.intermediate_size
hidden_size = model.config.hidden_size
gradient_caluculation_memory = \
batch_size * seq_len * intermediate_size * element_size + \
intermediate_size * hidden_size * element_size + \
2 * batch_size * seq_len * intermediate_size + \
hidden_size * hidden_size * element_size - \
batch_size * seq_len * hidden_size
return gradient_caluculation_memory
Now we are able to estimate the peak GPU memory usage during fine-tuning by adding:
- Memory of the model weights
- Memory of the Adam gradient moments (= 2 x model weights)
- Memory of the feedforward tensors
- 2 x cuBLAS workspace size
- Memory of the temporary gradient calculations (see the function above)
def getExpectedMaximumAllocatedMemory(batch_size, seq_len, model, element_size,
expected_memory_model_weights,
cublas_workspace_size):
expected_total_memory_ffw_tensors = getExpectedTotalMemoryFfwTensors(
batch_size, seq_len, model, element_size)
temp_grad_calc_memory = getTempGradientCalculationMemory(
batch_size, seq_len, model, element_size)
expected_max_memory_allocated = \
3 * expected_memory_model_weights + \
expected_total_memory_ffw_tensors + \
2 * cublas_workspace_size + \
temp_grad_calc_memory
return expected_max_memory_allocated
Let's compare the actual maximum allocated memory observed during training to the expected maximum. First, the expected value:
expected_max_memory_allocated = \
getExpectedMaximumAllocatedMemory(batch_size,
seq_len,
mdl,
element_size,
expected_memory_model_weights,
cublas_workspace_size)
print(
f"Expected maximum allocated memory by tensors: "
f"{expected_max_memory_allocated / 1024**2:.2f} MiB")
Expected maximum allocated memory by tensors: 10562.81 MiB
And here is the actual maximum allocated memory by tensors:
actual_max_memory_allocated = torch.cuda.max_memory_allocated()
print(
f"Actual maximum allocated memory by tensors: "
f"{actual_max_memory_allocated / 1024**2:.2f} MiB")
Actual maximum allocated memory by tensors: 10562.79 MiB
Fantastic! We've successfully achieved a highly accurate prediction of the maximum GPU allocation during the fine-tuning process.
A valuable observation involves examining how the different tensors contribute proportionally to the maximum memory allocation under the current hyperparameter settings:
print("\nProportion model weights memory, gradient moments memory and "
"feed forward tensors memory within maximum allocated memory:")
print(
f"Proportion feed forward tensors: "
f"{expected_total_memory_ffw_tensors / expected_max_memory_allocated:.0%}")
print(
f"Proportion gradient moments: "
f"{expected_memory_gradient_moments / expected_max_memory_allocated:.0%}")
print(
f"Proportion model weights: "
f"{expected_memory_model_weights / expected_max_memory_allocated:.0%}")
Proportion model weights memory, gradient moments memory and feed forward tensors memory within maximum allocated memory:
Proportion feed forward tensors: 63%
Proportion gradient moments: 24%
Proportion model weights: 12%
Once again, we observe that the feedforward tensors significantly dominate the maximum memory allocation, with their proportion increasing as batch sizes or sequence lengths expand.
As a further analysis, we investigate how the maximum memory allocation changes with increasing batch size, keeping the sequence length constant at 512:
batch_sizes = list(range(1, 33))
# batch_sizes = [1,2,4,8,16,32]
seq_len = 512
ffw_tensor_memory_list = \
[getExpectedTotalMemoryFfwTensors(bs, seq_len, mdl, element_size) / 1024**3
for bs in batch_sizes]
temp_grad_calc_memory_list = \
[getTempGradientCalculationMemory(bs, seq_len, mdl, element_size) / 1024**3
for bs in batch_sizes]
data = {"ffw tensors": ffw_tensor_memory_list,
"gradient moments":
[expected_memory_gradient_moments / 1024**3] * len(batch_sizes),
"model weights":
[expected_memory_model_weights / 1024**3] * len(batch_sizes),
"default GPU occup.":
[default_gpu_occupation / 1024**3] * len(batch_sizes),
"cublas workspace":
[2 * cublas_workspace_size / 1024**3] * len(batch_sizes),
"gradient calculation": temp_grad_calc_memory_list}
weightsPerBatchSize = pd.DataFrame(data, index=batch_sizes)
# Create a stacked bar chart
ax = weightsPerBatchSize.plot(kind='bar', stacked=True)
# Adding max GPU memory as horizontal line
max_gpu_memory = torch.cuda.mem_get_info()[1] / 1024**3
ax.axhline(y=max_gpu_memory, color='red', linestyle='--')
# Adding titles and labels
plt.title(f"Max occupied GPU memory per batch size (seq_len={seq_len})")
plt.xlabel('Batch size')
plt.ylabel('Memory (GiB)')
plt.show()
Please note, the red dashed line represents the GPU's maximum memory capacity, which in my instance is 24 GiB. Your GPU's capacity may vary. Our analysis indicates that the largest batch size permissible for fine-tuning the BERT-large model with a sequence length of 512 is 11. You are encouraged to verify this finding with your own setup. We also see that the memory requirements (driven by the feedforward tensors) increase linearly with the batch size.
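If you want to determine this number programmatically for your own GPU, here is a small helper of my own (an estimate only; it reuses the functions and globals defined above and ignores any memory that PyTorch keeps reserved beyond what is allocated by tensors):
# My own helper: largest batch size whose expected peak memory, plus the default
# occupation by drivers and the CUDA context, still fits into the total GPU memory
def getLargestFeasibleBatchSize(seq_len, model, element_size, max_batch_size=64):
    total_gpu_memory = torch.cuda.mem_get_info()[1]
    feasible_batch_size = None
    for bs in range(1, max_batch_size + 1):
        expected_peak = getExpectedMaximumAllocatedMemory(
            bs, seq_len, model, element_size,
            expected_memory_model_weights, cublas_workspace_size)
        if expected_peak + default_gpu_occupation > total_gpu_memory:
            break
        feasible_batch_size = bs
    return feasible_batch_size

print(f"Largest expected feasible batch size for seq_len=512: "
      f"{getLargestFeasibleBatchSize(512, mdl, element_size)}")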
Next, we do the same analysis, but this time varying the sequence length and setting the batch size to 16:
batch_size = 16
sequence_lengths = list(range(128, 513, 16))
# sequence_lengths = [16,32,64,128, 256, 512]
ffw_tensor_memory_list = \
[getExpectedTotalMemoryFfwTensors(batch_size, sl, mdl, element_size) /
1024**3 for sl in sequence_lengths]
temp_grad_calc_memory_list = \
[getTempGradientCalculationMemory(batch_size, sl, mdl, element_size) /
1024**3 for sl in sequence_lengths]
data = {"ffw tensors": ffw_tensor_memory_list,
"gradient moments":
[expected_memory_gradient_moments /
1024**3] * len(sequence_lengths),
"model weights":
[expected_memory_model_weights / 1024**3] * len(sequence_lengths),
"default GPU occup.":
[default_gpu_occupation / 1024**3] * len(sequence_lengths),
"cublas workspace":
[2 * cublas_workspace_size / 1024**3] * len(sequence_lengths),
"gradient calculation": temp_grad_calc_memory_list}
weightsPerSequenceLength = pd.DataFrame(data, index=sequence_lengths)
# Create a stacked bar chart
ax = weightsPerSequenceLength.plot(kind='bar', stacked=True)
# Adding max GPU memory as horizontal line
max_gpu_memory = torch.cuda.mem_get_info()[1] / 1024**3
ax.axhline(y=max_gpu_memory, color='red', linestyle='--')
# Adding titles and labels
plt.title(f"Max occupied GPU memory per sequence length (batch_size={batch_size})")
plt.xlabel('Sequence Length')
plt.ylabel('Memory (GiB)')
plt.show()
Our findings reveal that the maximum sequence length achievable with a batch size of 16 is 416. Unlike the linear growth observed when increasing the batch size, GPU memory consumption grows superlinearly as the sequence length increases, driven by a quadratic term. This reflects the inherent quadratic complexity of enlarging the context size in transformer models: the traditional transformer-style attention mechanism, which compares every token with every other token, is the primary contributor to this increased memory demand.
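To make the quadratic term tangible, here is a small illustration of my own, reusing the variables defined above: it computes the size of a single attention-score tensor per encoder layer for a few sequence lengths. Doubling the sequence length quadruples this tensor.
# Size of one attention-score tensor per encoder layer (batch_size = 16 here)
for sl in (128, 256, 512):
    scores_bytes = num_attention_heads * batch_size * sl * sl * element_size
    print(f"seq_len={sl}: {scores_bytes / 1024**2:.0f} MiB per layer")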
In conclusion, we will explore various combinations of batch sizes and sequence lengths to understand how they accommodate the available memory constraints:
batch_sizes = [4, 8, 16, 32, 64, 128]
sequence_lengths = list(range(128, 513, 64))
max_memory_lists = \
[
[
getExpectedMaximumAllocatedMemory(bs,
sl,
mdl,
element_size,
expected_memory_model_weights,
cublas_workspace_size) / 1024**3
for sl in sequence_lengths]
for bs in batch_sizes]
max_memory_lists = pd.DataFrame(
max_memory_lists,
index=batch_sizes,
columns=sequence_lengths)
# Add default GPU occupation
max_memory_lists += default_gpu_occupation / 1024**3
threshold = torch.cuda.mem_get_info()[1] / 1024**3
# Function to create the plot with squares,
# including boundary lines and axis labels
def create_enhanced_square_plot(df, threshold, xlabels, ylabels):
# Determine the size of the plot grid
nrows, ncols = df.shape
# Create a figure and axis
fig, ax = plt.subplots(figsize=(ncols * 2, nrows * 2))
# Loop through the data and create a square for each cell
for i in range(nrows):
for j in range(ncols):
# Determine the color based on the threshold
color = 'lightgreen' if df.iloc[i,
j] <= threshold else 'lightcoral'
# Create a square with a border
square = patches.Rectangle(
(j, nrows - i - 1), 1, 1, facecolor=color, edgecolor='black')
# Add the square to the plot
ax.add_patch(square)
# Add the text in the middle of the square
text = round(df.iloc[i, j], 1)
ax.text(j + 0.5, nrows - i - 0.5, text,
ha='center', va='center', fontsize=12)
# Set the limits and aspect of the plot
ax.set_xlim(0, ncols)
ax.set_ylim(0, nrows)
ax.set_aspect('equal')
# Add axis labels
ax.set_xticks([0.5 + i for i in range(ncols)])
ax.set_xticklabels(xlabels)
ax.set_yticks([0.5 + i for i in range(nrows)])
ax.set_yticklabels(ylabels[::-1])
ax.set_xlabel("Sequence length")
ax.set_ylabel("Batch size")
# Set title
ax.set_title(
"Maximum allocated memory with respect to "
"sequence length and batch size (in GiB)")
# Show the plot
plt.show()
# Apply the function to the example DataFrame
create_enhanced_square_plot(
max_memory_lists, threshold, sequence_lengths, batch_sizes)
For our final analysis, we will examine which feedforward tensor has the largest memory footprint:
def memoryPerFfwTensor(batch_size, seq_len, model, element_size):
expected_memory_per_ffw_tensor = pd.Series(
getExpectedMemoryPerFfwTensor(batch_size,
seq_len,
model,
element_size)
)
# Only consider one encoder layer
expected_memory_per_ffw_tensor.index = \
expected_memory_per_ffw_tensor.index.str.replace("layer_0", "layer")
expected_memory_per_ffw_tensor = \
expected_memory_per_ffw_tensor[
~expected_memory_per_ffw_tensor.index.str.contains(r"layer_\d")
]
# Sort
expected_memory_per_ffw_tensor.sort_values(ascending=False, inplace=True)
# Add relative size
expected_memory_per_ffw_tensor = pd.DataFrame(
{"Absolute Size":
expected_memory_per_ffw_tensor / 1024,
"Relative Size":
expected_memory_per_ffw_tensor / expected_memory_per_ffw_tensor.sum(),
})
# Add cumulated values
expected_memory_per_ffw_tensor["Cumulated"] = \
expected_memory_per_ffw_tensor["Relative Size"].cumsum()
return expected_memory_per_ffw_tensor
expected_memory_per_ffw_tensor = memoryPerFfwTensor(
batch_size, seq_len, mdl, element_size)
# Convert to percentage format
expected_memory_per_ffw_tensor[["Relative Size", "Cumulated"]] = \
expected_memory_per_ffw_tensor[["Relative Size", "Cumulated"]].map(
lambda x: f'{x:.2%}')
print(expected_memory_per_ffw_tensor)
                                         Absolute Size Relative Size Cumulated
layer_attention_self_probs                 262144.0000        21.76%    21.76%
layer_attention_self_scores                262144.0000        21.76%    43.52%
layer_intermediate_gelu                    131072.0000        10.88%    54.40%
layer_intermediate_dense                   131072.0000        10.88%    65.28%
layer_attention_self_probs_dropout          65536.0000         5.44%    70.72%
layer_output_dense                          32768.0000         2.72%    73.44%
layer_attention_output_layer_norm           32768.0000         2.72%    76.16%
embeddings_add                              32768.0000         2.72%    78.88%
layer_attention_output_dense                32768.0000         2.72%    81.60%
layer_attention_self_context                32768.0000         2.72%    84.32%
layer_output_layer_norm                     32768.0000         2.72%    87.04%
layer_attention_self_value_dense            32768.0000         2.72%    89.76%
layer_attention_self_key_dense              32768.0000         2.72%    92.48%
layer_attention_self_query_dense            32768.0000         2.72%    95.20%
embeddings_layer_norm                       32768.0000         2.72%    97.92%
layer_output_dropout                         8192.0000         0.68%    98.60%
embeddings_dropout                           8192.0000         0.68%    99.28%
layer_attention_output_dropout               8192.0000         0.68%    99.96%
input                                          96.0000         0.01%    99.97%
pooler_dense                                   64.0000         0.01%    99.98%
pooler_tanh                                    64.0000         0.01%    99.98%
layer_output_layer_norm_shift                  32.0000         0.00%    99.99%
layer_attention_output_layer_norm_shift        32.0000         0.00%    99.99%
layer_output_layer_norm_bias                   32.0000         0.00%    99.99%
layer_attention_output_layer_norm_bias         32.0000         0.00%    99.99%
embeddings_layer_norm_shift                    32.0000         0.00%   100.00%
embeddings_layer_norm_bias                     32.0000         0.00%   100.00%
dropout                                        16.0000         0.00%   100.00%
target                                          0.0625         0.00%   100.00%
classifier                                      0.0625         0.00%   100.00%
Our analysis reveals that the self-attention tensors, due to their quadratic complexity, consume the most memory, accounting for over 40% of the total. They are followed by the feed-forward tensors, particularly those with intermediate-size embeddings. Notably, increasing the sequence length further amplifies the memory demand of these self-attention layers.
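As a quick back-of-the-envelope check of this observation (my own addition, reusing the variables defined above): at a sequence length of 512, a single attention-score tensor holds num_attention_heads * seq_len / hidden_size = 8 times as many elements as a hidden-size activation tensor, which is why the score and probability tensors alone account for over 40% of the feedforward memory.
# Elements of one attention-score tensor vs. one hidden-size activation tensor
# (per example and per encoder layer)
scores_elems = num_attention_heads * seq_len ** 2
hidden_elems = seq_len * hidden_size
print(f"Attention scores hold {scores_elems / hidden_elems:.0f}x more elements "
      f"than a hidden-size activation tensor")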
Final Thoughts¶
I've designed the BERT training script to be highly adaptable, enabling experimentation with various established techniques aimed at minimizing memory consumption during training, such as:
- Gradient accumulation
- Gradient checkpointing
- Training with FP16 precision
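For example, a run that combines all three options could be launched as shown below. Note that this is just an illustrative call of the train_bert function defined above; in the version shown in this notebook the fp16 flag is only forwarded to TrainingArguments, so you may need to wire it into the training loop (e.g. via torch.autocast and a gradient scaler) for it to take effect.
# Illustrative call of the training function with memory-saving options enabled
train_bert("bert-large-uncased",
           batch_size=4,
           seq_len=512,
           gradient_checkpointing=True,
           fp16=True,
           gradient_accumulation_steps=4,
           num_iterations=4)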
This methodology is not limited to BERT and can be seamlessly extended to other transformer-based models, including language models like Mistral or Phi-2.
I welcome any feedback, critiques, or suggestions you might have!
With that, we conclude this blog post. Thank you for your time and interest. I look forward to our next encounter!