In the great paper "The Truth is in There: Improving Reasoning in Language Models with Layer-Selective Rank Reduction", Sharma, Ash, and Misra (2023) present compelling evidence that their LAyer SElective Rank Reduction (LASER) approach can markedly improve the performance of language models. Essentially, LASER replaces certain weight matrices in language models with lower-rank approximations obtained via singular value decomposition (SVD). Although these modified matrices contain less information than the original ones, the performance of the model surprisingly increases, without any further pre-training or fine-tuning. Computing these matrix approximations is straightforward (achievable with a few lines of Python code) and fast, which makes the proposed method easy to implement.
In this post, I delve into the world of Singular Value Decomposition (SVD) — explaining its theoretical underpinnings and bringing it to life with 3D visualizations. To top it off, I'll demonstrate the application of LASER to a BERT model. This practical example aims to showcase not just the simplicity of implementing LASER, but also its effectiveness in enhancing language models.
Intuition Behind Singular Value Decomposition (SVD)¶
1. Matrix as Transformation:¶
In the paper, the authors apply LASER on weight matrices within typical transformer-based language models, like the query, key, value or output matrices in the self-attention block, or the weight matrices of the MLP block of a transformer layer. Think of such a matrix $M$ as a linear transformation $ T: \mathbb{R}^n \rightarrow \mathbb{R}^m, x \longmapsto Mx$. $M$ can stretch, shrink, rotate or reflect vectors in a space.
I will visualize this in 3D space, so for our purposes let $n$ and $m$ both equal 3. All of the results we see in this post generalize to arbitrary values of $n$ and $m$, with only minor adjustments when $n \neq m$.
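As a brief aside on the rectangular case: NumPy's SVD handles non-square matrices as well; with `full_matrices=False` it returns the reduced form, where the number of singular values equals $\min(n, m)$. A minimal sketch of my own (not used in the cube example below):
import numpy as np
# Reduced SVD of a rectangular matrix: U has shape (m, k), S holds the k singular values,
# and Vt has shape (k, n), where k = min(m, n)
A = np.random.rand(4, 3)
U_r, S_r, Vt_r = np.linalg.svd(A, full_matrices=False)
print(U_r.shape, S_r.shape, Vt_r.shape)  # expected: (4, 3) (3,) (3, 3)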
I start with the 3D unit cube, a cube with one corner at the origin (0,0,0) and sides that are 1 unit long. As a first step, I define a function that plots a general rectangular cuboid given the coordinates of its vertices and the lower and upper limits for all axes:
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
def plot_rectangular(vertices, limits):
    # Define the edges connecting the vertices
    edges = [[vertices[:,i], vertices[:,j]] for i, j in [
        (0, 1), (1, 2), (2, 3), (3, 0), # bottom face
        (4, 5), (5, 6), (6, 7), (7, 4), # top face
        (0, 4), (1, 5), (2, 6), (3, 7)  # side faces
    ]]
    # Create a 3D plot
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Plot the edges
    for edgeIdx, edge in enumerate(edges):
        if edgeIdx < 4:
            ax.plot(*zip(*edge), color='darkblue')
        elif edgeIdx > 7:
            ax.plot(*zip(*edge), color='gold')
        else:
            ax.plot(*zip(*edge), color='darkred')
    # Set plot display parameters
    ax.set_xlabel('X axis')
    ax.set_ylabel('Y axis')
    ax.set_zlabel('Z axis')
    ax.set_xlim(limits)
    ax.set_ylim(limits)
    ax.set_zlim(limits)
    plt.show()
Note that I use different colors for the bottom, top and side edges. This makes it easier to recognize the object after the transformation. Additionally, I allow the axis limits to be set manually so that the different figures are comparable.
Plot of the 3D unit cube:
limits = (-2,3)
vertices_3d_unit_cube = np.array([[0, 0, 0],
[1, 0, 0],
[1, 1, 0],
[0, 1, 0],
[0, 0, 1],
[1, 0, 1],
[1, 1, 1],
[0, 1, 1]])
# Transpose, as data points should be given as column vectors
vertices_3d_unit_cube = np.transpose(vertices_3d_unit_cube)
# Plot
plot_rectangular(vertices_3d_unit_cube,limits)
Next, we define a transformation matrix $M$ with which we transform the unit cube. $M$ combines a 45° rotation around the z-axis with a 45° rotation around the y-axis. Additionally, the x, y and z axes are scaled by factors of 1.2, 1.4 and 0.8, respectively.
# Angle for rotation
angle_deg = 45
angle_rad = np.radians(angle_deg)
# Rotation matrix for 45° rotation around z-axis
rotation_matrix_z_45 = np.array([
[np.cos(angle_rad), -np.sin(angle_rad), 0],
[np.sin(angle_rad), np.cos(angle_rad), 0],
[0, 0, 1]
])
# Rotation matrix for 45° rotation around y-axis
rotation_matrix_y_45 = np.array([
[np.cos(angle_rad), 0, np.sin(angle_rad)],
[0, 1, 0],
[-np.sin(angle_rad), 0, np.cos(angle_rad)]
])
# Scaling all axes
scaling_matrix = np.diag([1.2,1.4,0.8])
# Create M
M = rotation_matrix_z_45.dot(rotation_matrix_y_45).dot(scaling_matrix)
# Transform unit cube
vertices_transformed = M.dot(vertices_3d_unit_cube)
# Plot
plot_rectangular(vertices_transformed,limits)
We see that the matrix multiplication both stretched and rotated the 3D unit cube.
2. Decomposing Transformation with SVD:¶
SVD allows us to decompose $M$ into:
- Rotation/Reflection matrices $U$ and $V^T$: These matrices are orthogonal (i.e. $U U^T = Id$). One can also interpret these matrices as basis changes of the vector space, with the columns of $U$ and $V$ forming orthonormal bases. More on that later.
- Scaling matrix $\Sigma$: A diagonal matrix with $M$'s singular values on the diagonal, in decreasing order. Singular values are non-negative real numbers.
Mathematical Representation: $M = U \Sigma V^T$
In Python, calculating the SVD of a matrix $M$ is simple:
# Do singular value decomposition
U, S, Vt = np.linalg.svd(M)
# S is given as an array of the singular values. Make it a diagonal matrix
S = np.diag(S)
# Print SVD decomposition
print("U:")
print(U)
print("\nSigma:")
print(S)
print("\nV^T")
print(Vt)
# Check whether M is indeed equal to U*S*Vt by calculating the sum of the absolute elementwise differences
print(f"\nSum of elementwise absolute differences between M and its SVD decompositon: {np.sum(np.abs(M-U.dot(S).dot(Vt)))}")
U:
[[ 7.07106781e-01 -5.00000000e-01 -5.00000000e-01]
 [-7.07106781e-01 -5.00000000e-01 -5.00000000e-01]
 [-5.55111512e-17  7.07106781e-01 -7.07106781e-01]]

Sigma:
[[1.4 0.  0. ]
 [0.  1.2 0. ]
 [0.  0.  0.8]]

V^T
[[-0. -1.  0.]
 [-1. -0.  0.]
 [-0.  0. -1.]]

Sum of elementwise absolute differences between M and its SVD decomposition: 7.993605777301127e-16
We see that $M$ is indeed equal to $U \Sigma V^T$.
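We can also quickly verify the claim from above that $U$ and $V^T$ are orthogonal (a small check of my own, reusing the U and Vt computed in the cell above):
# Orthogonality check: U U^T and V^T V should equal the identity up to floating point error
print(np.allclose(U.dot(U.T), np.eye(3)))   # should print True
print(np.allclose(Vt.T.dot(Vt), np.eye(3))) # should print True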
As a next step, let's check visually that:
- $U$ and $V^T$ rotate/reflect the object
- $\Sigma$ scales the object
Rotate unit cube with $V^T$:
vertices_transformed_Vt = Vt.dot(vertices_3d_unit_cube)
plot_rectangular(vertices_transformed_Vt,limits)
Scale with $\Sigma$:
vertices_transformed_S = S.dot(vertices_transformed_Vt)
plot_rectangular(vertices_transformed_S,limits)
Rotate with $U$:
vertices_transformed_U = U.dot(vertices_transformed_S)
plot_rectangular(vertices_transformed_U,limits)
3. Approximating $M$ by Removing Smaller Singular Values¶
Applying Singular Value Decomposition (SVD) to the weight matrices of language models won't have any effect, as SVD perfectly replicates the original matrix $M$. However, the real utility of SVD emerges in this case when seeking a lower-rank approximation of $M$. This is achieved by leveraging the fact that the larger singular values in the diagonal matrix $\Sigma$ have a more significant impact on the transformation represented by $M$. By strategically discarding components associated with smaller singular values, we retain only the most influential aspects of $M$'s transformation. This process effectively simplifies $M$, capturing its essential characteristics while reducing the complexity of the data.
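To make this precise: if $M = U \Sigma V^T$ has singular values $\sigma_1 \geq \sigma_2 \geq \ldots$, then keeping only the $k$ largest singular values yields the rank-$k$ matrix $M_k = \sum_{i=1}^{k} \sigma_i U_i V_i^T$ (with $U_i$ and $V_i$ the $i$-th columns of $U$ and $V$). By the Eckart-Young theorem, this $M_k$ is the best rank-$k$ approximation of $M$ with respect to both the spectral and the Frobenius norm.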
To illustrate this in practical terms, consider again the example of transforming the 3D unit cube with $M$. If we eliminate the smallest singular value, the transformation with the lower-rank approximation of $M$ still yields a shape close to the original rectangular cuboid (at least if the smallest singular value is close to 0). However, the result is now flattened into a plane, as one dimension of the cuboid is effectively 'lost' in this transformation.
I plan to visualize this concept using several different transformation matrices $M$. The examples will start with simpler matrices and progressively increase in complexity to demonstrate the effects of lower-rank approximations in various scenarios.
First, I define a function to apply the SVD approximation of $M$ by dropping the smallest singular value:
def approx_matrix_SVD(M):
    # Computing the Singular Value Decomposition (SVD) of the matrix M
    U, S, Vt = np.linalg.svd(M)
    # Recreate M, but without using the last singular value
    M_approx = U[:,:-1].dot(np.diag(S[:-1])).dot(Vt[:-1,:])
    return M_approx
Note that this function can be easily extended to eliminate an arbitrary number of the smallest singular values. This extension will be necessary, for instance, when applied to language models. However, for the current demonstration, I will only remove the smallest singular value, as this most effectively visualizes the concept.
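For completeness, here is a minimal sketch of such a generalization (the function name and the parameter k are my own; the tensor-based variant actually used for BERT appears later in this post):
def approx_matrix_SVD_topk(M, k):
    # Keep only the k largest singular values and drop the rest
    U, S, Vt = np.linalg.svd(M)
    return U[:,:k].dot(np.diag(S[:k])).dot(Vt[:k,:])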
Example 1: No rotation, only scale z axis
In this example, I apply a scaling transformation to the unit cube, specifically scaling the z-axis by a factor of 0.3. This operation does not involve any rotation, so the singular values corresponding to the x and y axes remain at one, while the singular value associated with the z-axis is 0.3. When employing SVD for rank reduction, we eliminate the smallest singular value. As a result, the z-dimension of the rectangular cuboid, having the smallest singular value (0.3), effectively vanishes:
M = np.array([[1, 0, 0],
[0, 1, 0],
[0, 0, 0.3]])
vertices_transformed_ex1 = M.dot(vertices_3d_unit_cube)
vertices_approx_transformed_ex1 = approx_matrix_SVD(M).dot(vertices_3d_unit_cube)
limits=(0,1)
print("Original transformation of the 3D unit cube:")
plot_rectangular(vertices_transformed_ex1,limits)
print("Approximated transformation of the 3D unit cube:")
plot_rectangular(vertices_approx_transformed_ex1,limits)
Original transformation of the 3D unit cube:
Approximated transformation of the 3D unit cube:
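As a quick numerical sanity check on top of the plots (my own addition), the rank of $M$ indeed drops from 3 to 2 after the approximation:
# The original M has full rank 3, the approximation has rank 2
print(np.linalg.matrix_rank(M))                     # expected: 3
print(np.linalg.matrix_rank(approx_matrix_SVD(M)))  # expected: 2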
Example 2: No rotation, only scale y axis
This approach operates independently of the specific axis chosen. For instance, if the y-axis possesses the smallest singular value, then it is this axis that gets eliminated.
M = np.array([[1, 0, 0],
[0, 0.3, 0],
[0, 0, 1]])
vertices_transformed_ex2 = M.dot(vertices_3d_unit_cube)
vertices_approx_transformed_ex2 = approx_matrix_SVD(M).dot(vertices_3d_unit_cube)
limits=(0,1)
print("Original transformation of the 3D unit cube:")
plot_rectangular(vertices_transformed_ex2,limits)
print("Approximated transformation of the 3D unit cube:")
plot_rectangular(vertices_approx_transformed_ex2,limits)
Original transformation of the 3D unit cube:
Approximated transformation of the 3D unit cube:
Example 3: The information loss is driven by the size of the smallest singular value
Let's compare two cases: First, the smallest singular value is close to 0, say 0.05. We will see that the approximated transformation will be very close to the true transformation:
M = np.array([[1, 0, 0],
[0, 1, 0],
[0, 0, 0.05]])
vertices_transformed_ex3_1 = M.dot(vertices_3d_unit_cube)
vertices_approx_transformed_ex3_1 = approx_matrix_SVD(M).dot(vertices_3d_unit_cube)
limits=(0,1)
print("Original transformation of the 3D unit cube:")
plot_rectangular(vertices_transformed_ex3_1,limits)
print("Approximated transformation of the 3D unit cube:")
plot_rectangular(vertices_approx_transformed_ex3_1,limits)
Original transformation of the 3D unit cube:
Approximated transformation of the 3D unit cube:
On the other hand, if the smallest singular value is rather large, the approximation loss will also be larger:
M = np.array([[1, 0, 0],
[0, 1, 0],
[0, 0, 0.8]])
vertices_transformed_ex3_2 = M.dot(vertices_3d_unit_cube)
vertices_approx_transformed_ex3_2 = approx_matrix_SVD(M).dot(vertices_3d_unit_cube)
limits=(0,1)
print("Original transformation of the 3D unit cube:")
plot_rectangular(vertices_transformed_ex3_2,limits)
print("Approximated transformation of the 3D unit cube:")
plot_rectangular(vertices_approx_transformed_ex3_2,limits)
Original transformation of the 3D unit cube:
Approximated transformation of the 3D unit cube:
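This observation can be made quantitative: when a single singular value $\sigma_3$ is dropped, the approximation error $\|M - M_{approx}\|$ equals exactly $\sigma_3$ (in the spectral norm, and here also in the Frobenius norm, since only one singular value is removed). A small check of my own, redefining the two example matrices locally:
# The approximation error equals the dropped singular value
for s in [0.05, 0.8]:
    M_check = np.diag([1, 1, s])
    err = np.linalg.norm(M_check - approx_matrix_SVD(M_check), ord='fro')
    print(f"Smallest singular value: {s}, approximation error: {err:.4f}")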
Example 4: Rotation/reflection and scaling
I conclude with some arbitrary matrix $M$.
M = np.array([[1, 2, 3],
[3, 1, 2],
[2, 3, 1]])
# Apply transformations
vertices_transformed_ex4= M.dot(vertices_3d_unit_cube)
vertices_approx_transformed_ex4 = approx_matrix_SVD(M).dot(vertices_3d_unit_cube)
# Plot
limits=(0,6)
print("Original transformation of the 3D unit cube:")
plot_rectangular(vertices_transformed_ex4,limits)
print("Approximated transformation of the 3D unit cube:")
plot_rectangular(vertices_approx_transformed_ex4,limits)
Original transformation of the 3D unit cube:
Approximated transformation of the 3D unit cube:
In this example, the outcome is similar to the previous examples: the resulting rectangular cuboid is approximated by projecting it onto a plane. However, in this case the plane does not align with any of the standard coordinate planes (x,y), (x,z) or (y,z). Intuitively, this plane is chosen so as to minimize the information lost by dropping one dimension, i.e. to minimize the distance between the plane and the vertices of the original rectangular cuboid (in fact, the plane is spanned by the first two columns of $U$). Seen this way, the resulting plane looks intuitive, as the cuboid is flattened along the direction in which it is thinnest.
Another way to think about this is to consider every rotation/reflection as a basis change. In the present example, if we changed the basis of our 3D space such that the resulting plane coincides with the (x,y) plane, we would recover the intuition of the previous examples (flattening out the dimension with the smallest singular value).
SVD provides exactly that. Remember that the columns of $U = (U_1, U_2, U_3)$ and $V = (V_1, V_2, V_3)$ from the singular value decomposition of some transformation $M$ ($M = U \Sigma V^T$) provide orthonormal bases of, in our case, $\mathbb{R}^3$.
If we express matrix $M$ as some linear transformation $ T: \mathbb{R}^3 \rightarrow \mathbb{R}^3, x \longmapsto Mx$, then this map has a very simple description with respect to these orthonormal bases: $T(V_i) = \sigma_i U_i$, where $\sigma_i$ is the i-th diagonal entry of $\Sigma$.
So in our example, if we first express the 3D unit cube w.r.t. $V$'s basis, apply $M$, and then express the result w.r.t. $U$'s basis, we are back in the situation of the previous examples, where the resulting plane is the (x,y) plane.
Therefore, SVD can be understood as a basis change under which the information lost by dropping the dimension with the smallest singular value is minimized.
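Before visualizing this, here is a short numerical check of the relation $T(V_i) = \sigma_i U_i$ for the example-4 matrix (the SVD is recomputed in this cell so it is self-contained):
# Check that M V_i = sigma_i * U_i holds for every column i
U, S, Vt = np.linalg.svd(M)
V = Vt.T
for i in range(3):
    print(np.allclose(M.dot(V[:, i]), S[i] * U[:, i]))  # should print True three times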
Let's again visualize the same transformation of example 4, but this time after changing the basis:
# Apply SVD
U, S, Vt = np.linalg.svd(M)
# To change the basis, we need to multiply with the inverse of the matrix whose columns are the new basis vectors
V_inv = np.linalg.inv(np.transpose(Vt))
U_inv = np.linalg.inv(U)
# Before transformation, change basis to V
vertices_unit_cube_base_V = V_inv.dot(vertices_3d_unit_cube)
# Transform with M and approximated M
vertices_base_V_transformed = M.dot(vertices_unit_cube_base_V)
vertices_base_V_approx_transformed = approx_matrix_SVD(M).dot(vertices_unit_cube_base_V)
# After transformation, change basis to U
vertices_base_U_transformed = U_inv.dot(vertices_base_V_transformed)
vertices_base_U_approx_transformed = U_inv.dot(vertices_base_V_approx_transformed)
# Plot
print("Original transformation of the 3D unit cube in base of U:")
plot_rectangular(vertices_base_U_transformed,limits)
print("Approximated transformation of the 3D unit cube in base of U:")
plot_rectangular(vertices_base_U_approx_transformed,limits)
Original transformation of the 3D unit cube in base of U:
Approximated transformation of the 3D unit cube in base of U:
Why does LASER work when applied to language models?¶
Now we understand what LASER does: it approximates transformation matrices $M$ by removing singular values that explain only a small share of the transformation's variance. However, it is still not clear why removing this information should be beneficial for the performance of language models.
The authors explain the beneficial effect of removing certain information from language models through the concept of denoising. In the context of open-ended question answering, they note that improvements often arise on questions whose answers are supported by data that occurs less frequently in the training set. When higher-order components (i.e. components associated with small singular values) are eliminated, the model seems to recover this "hidden" or less frequent information.
The higher-order components of a model are suspected to often capture incorrect but semantically similar responses. When all components are used, these may lead to conflicting responses and the model hence produces generic, high-frequency tokens like "a", "the" or "of". By removing these higher-order components, the model's internal conflict is resolved. This allows the model to more accurately respond with the correct entity rather than defaulting to these generic responses.
For further information, have a look at the paper.
Simple Application: BERT-large Performance on SST-2 with and without LASER¶
To explore the effectiveness of the LASER approach, we'll apply it to an actual language model and observe the results.
We begin by defining the essential functions needed for our experiment. The first one is an adaptation of the SVD approximation, tailored to work with matrices represented as PyTorch tensors. This modification is crucial since the weight matrices in language models typically use this data format.
Another enhancement in our function is the ability to remove more than one singular value. This feature aligns with the methodology outlined in the paper. We introduce a parameter, denoted as $\rho$, to manage the fraction of the maximum rank to be eliminated during the low-rank approximation process. This addition offers more flexibility and control over the degree of approximation applied to the model.
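To get a feeling for the numbers (assuming BERT-large's usual dimensions of hidden size 1024 and intermediate size 4096): the MLP weight matrices satisfy `min(M.shape) = 1024`, so $\rho = 0.9$ keeps $r = \mathrm{round}(0.1 \cdot 1024) = 102$ singular values, while the default $\rho = 0.995$ keeps only $r = \mathrm{round}(0.005 \cdot 1024) = 5$.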
import torch
import evaluate
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset
import numpy as np
def approx_matrix_SVD(M, rho=0.995):
    r = round((1-rho) * min(M.shape))
    # Computing the Singular Value Decomposition (SVD) of the matrix M
    U, S, Vt = torch.linalg.svd(M)
    # Recreate M, but only using the first r singular values
    M_approx = U[:,:r] @ torch.diag(S[:r]) @ Vt[:r,:]
    return M_approx
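A quick sanity check of this function on a random tensor (my own sketch, not part of the evaluation below): the rank of the approximation should equal round((1 - rho) * min(M.shape)).
# For a 1024 x 4096 matrix and rho = 0.9, we expect a rank of round(0.1 * 1024) = 102
W = torch.randn(1024, 4096)
W_approx = approx_matrix_SVD(W, rho=0.9)
print(torch.linalg.matrix_rank(W_approx))  # expected: tensor(102)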
Moving forward, I'll introduce a function designed to execute and assess the performance of a given model using the SST-2 benchmark. This step is fairly standard in the process.
def evaluate_model(model, tokenizer, dataset_name, batch_size=8, split='validation'):
    # Load dataset and metric
    dataset = load_dataset("glue", dataset_name)
    metric = evaluate.load('glue', dataset_name)
    # Tokenize the input texts
    def tokenize(batch):
        return tokenizer(batch['sentence'], padding=True, truncation=True)
    dataset[split] = dataset[split].map(tokenize, batched=True)
    dataset[split].set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    # DataLoader
    dataloader = DataLoader(dataset[split], batch_size=batch_size)
    # Evaluation
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            predictions = torch.argmax(logits, axis=-1)
            metric.add_batch(predictions=predictions, references=labels)
    final_score = metric.compute()
    return final_score['accuracy']
I apply the LASER methodology to a BERT-large model, which has been fine-tuned specifically for the SST-2 benchmark.
# Setting up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Adjust the batch size as needed
batch_size = 16 # Example batch size
# Model and Dataset
model_name = 'assemblyai/bert-large-uncased-sst2'
dataset_name = 'sst2'
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name).to(device)
Using device: cuda
Before we dive into the application of the LASER approach, it's crucial to establish a baseline for comparison. To do this, we calculate the validation accuracy of our BERT model in its original state, without the LASER modifications.
# Run without LASER
model = BertForSequenceClassification.from_pretrained(model_name).to(device)
val_score_without_laser = evaluate_model(model, tokenizer, dataset_name)
print(f"Validation score without laser: {val_score_without_laser:.2%}")
Validation score without laser: 92.89%
Now, we are set to apply the LASER technique to our BERT model. LASER can be applied to any linear transformation within a language model. This led the authors to employ a grid-search strategy, exploring various layers ($l$), matrices ($\tau$) and fractions ($\rho$) to pinpoint the most effective LASER intervention. They didn't stop there; they also experimented with different combinations of LASER interventions to further optimize their results.
However, it's important to note that conducting such a comprehensive grid search is an immensely time-consuming endeavor. Given that this post is more of an illustrative exercise, we adopt a more streamlined approach.
In our case, we apply LASER specifically to the output projection of the MLP in a transformer layer (`output.dense` in the Hugging Face BERT implementation). This focus is based on the authors' findings, which highlighted significant improvements when LASER was applied to the MLP matrices. Our grid search is limited to varying the layer number $l$ (ranging from 0 to 23) and the fraction $\rho$ (0.1, 0.5 or 0.9).
# Run with LASER
layers = list(range(24)) # Transformer block index (0-based)
rhos = [0.1,0.5,0.9]
layer_num = []
rho_val = []
accuracy = []
for layer in layers:
    for rho in rhos:
        model_laser = BertForSequenceClassification.from_pretrained(model_name).to(device)
        # Apply LASER to the MLP output projection of the current layer
        model_laser.bert.encoder.layer[layer].output.dense.weight.data = approx_matrix_SVD(model_laser.bert.encoder.layer[layer].output.dense.weight.data, rho = rho)
        layer_num.append(layer) # Track current layer
        rho_val.append(rho) # Track current rho
        accuracy.append(evaluate_model(model_laser, tokenizer, dataset_name)) # Track accuracy
# Print laser results
import pandas as pd
results_laser = pd.DataFrame({"Layer" : layer_num, "Rho" : rho_val, "Accuracy" : accuracy})
results_laser.sort_values(by=['Accuracy'], inplace = True, ignore_index = True, ascending=False)
print(results_laser.head(10))
   Layer  Rho  Accuracy
0      1  0.1  0.932339
1      1  0.5  0.932339
2      1  0.9  0.932339
3      4  0.1  0.931193
4     21  0.9  0.931193
5     21  0.5  0.931193
6     21  0.1  0.931193
7      4  0.5  0.931193
8      4  0.9  0.931193
9      5  0.5  0.930046
It turns out that several LASER interventions can indeed improve BERT's performance, and notably, this is achieved without additional pretraining or finetuning. This finding suggests that targeted adjustments can effectively enhance a language model's capabilities, offering a practical approach to model optimization.
That's it! I hope you liked my first blog post. If you have any questions, comments, or suggestions for improvement, then it's best to contact me via email.