In [1]:
# %pip install gputil
# %pip install setuptools
# %pip install transformers
# %pip install torch

# %pip install auto-gptq #==0.4.0

In [1]:
import GPUtil

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
# from auto_gptq import AutoGPTQForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
gpus = GPUtil.getGPUs()
if not gpus:
    print("No GPU detected on this system.")
else:
    for gpu in gpus:
        print(f"GPU Name: {gpu.name}")
        print(f"Total VRAM: {gpu.memoryTotal} MB")
        print(f"Free VRAM: {gpu.memoryFree} MB")
        print(f"Used VRAM: {gpu.memoryUsed} MB")
        print("-" * 40)

No GPU detected on this system.


In [2]:
def grab_model(model_name, quantized = False):
    if quantized:
        model = AutoGPTQForCausalLM.from_quantized(model_name, device="cpu", use_safetensors=True)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [63]:
modelA, tokenizerA = grab_model("gpt2")
modelB, tokenizerB = grab_model("EleutherAI/gpt-neo-125M")

# modelA, tokenizerA = grab_model("EleutherAI/gpt-neo-125M-4bit", quantized=True)
# modelB, tokenizerB = grab_model("iproskurina/opt-125m-GPTQ-4bit-g128", quantized=True)

In [5]:
modelA.config.hidden_size == modelB.config.hidden_size 

True

In [60]:
# replace tokenizer:
# modelA.Tokenizer = tokenizerB   # optional when not accessing directly 

# replace token embeddings for input and output:
# modelA.set_input_embeddings(modelB.get_input_embeddings())
# modelA.lm_head.weight = modelB.get_input_embeddings().weight
# modelA.resize_token_embeddings(tokenizerB.vocab_size)

# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight



In [None]:
# emb1 = modelA.get_input_embeddings().weight
# emb2 = modelB.get_input_embeddings().weight

# print("ModelA mean norms:", torch.norm(emb1, dim=1).mean().item())
# print("ModelB mean norms:", torch.norm(emb2, dim=1).mean().item())

# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()

# print(scaling_factor)

# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)

# print("new_embedding mean norms:", torch.norm(new_embedding.weight, dim=1).mean().item())

# modelA.set_input_embeddings(new_embedding)
# modelA.lm_head.weight = new_embedding.weight


In [113]:
print(torch.isnan(modelB.get_input_embeddings().weight).any())
print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())
print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())

tensor(False)
tensor(22.2842, grad_fn=<MaxBackward1>)
tensor(11.5013, grad_fn=<MeanBackward0>)


In [4]:
def check_orthogonal(R):
    I = torch.eye(R.size(0), device=R.device)
    delta = torch.norm(R.T @ R - I)
    print(f"Delta: {delta:.6e}")
    

In [8]:
# use proscrustes:
def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    # A_centered = A - A.mean(dim=0, keepdim=True)
    # B_centered = B - B.mean(dim=0, keepdim=True)

    #M = B_centered.T @ A_centered
    M = B.T @ A
    # find optimal rotation with svd
    U, _, Vt = torch.linalg.svd(M)

    # get rotation matrix that aligns B to A
    R = U @ Vt

    check_orthogonal(R)
    
    return R # return rotated tensor





In [None]:
emb1 = modelA.get_input_embeddings().weight
emb2 = modelB.get_input_embeddings().weight

# get rotation matrix
R = procrustes(emb2, emb1)
emb1_R = emb1 @ R

new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)

modelA.set_input_embeddings(new_embedding)
modelA.lm_head.weight = new_embedding.weight

# def rotate_weight(W, R):
#     if W.shape[1] == R.shape[0]:
#         return W @ R
#     if W.shape[0] == R.shape[0]:
#         return R.T @ W

# now fix the other layers by conjugating:
# for block in modelA.transformer.h:
#     for M in [block.attn.c_attn, block.mlp.c_fc]:
#         W = M.weight.data
#         W[:] = R.T @ W
#     for M in [block.attn.c_proj, block.mlp.c_proj]:
#         W = M.weight.data
#         W[:] = R.T @ W @ R

def split_rotate_concat(W):
    parts1 = [x for x in W.split(768, dim=1)]
    for i, v in enumerate(parts1):
        parts2 = [x for x in v.split(768, dim=0)]
        for j, w in enumerate(parts2):
            parts2[j] = R.T @ w @ R
        parts1[i] = torch.cat(parts2, dim=0)
    return torch.cat(parts1, dim=1)


def rotate_layernorm(ln):
    ln.weight.data[:] = ln.weight.data @ R
    ln.bias.data[:] = ln.bias.data @ R

for block in modelA.transformer.h:
    # print(block.attn.c_attn.weight.data.shape)
    # print(block.mlp.c_fc.weight.data.shape)
    # print(block.attn.c_proj.weight.data.shape)
    # print(block.mlp.c_proj.weight.data.shape)
    # block.attn.c_attn.weight.data[:] = split_rotate_concat(block.attn.c_attn.weight.data.T).T
    # block.mlp.c_fc.weight.data[:] = split_rotate_concat(block.mlp.c_fc.weight.data.T).T
    block.attn.c_attn.weight.data[:] = split_rotate_concat(block.attn.c_attn.weight.data)
    block.mlp.c_fc.weight.data[:] = split_rotate_concat(block.mlp.c_fc.weight.data)
    block.attn.c_proj.weight.data[:] = split_rotate_concat(block.attn.c_proj.weight.data)
    block.mlp.c_proj.weight.data[:] = split_rotate_concat(block.mlp.c_proj.weight.data)
    rotate_layernorm(block.ln_1)
    rotate_layernorm(block.ln_2)

rotate_layernorm(modelA.transformer.ln_f)


print(modelA.transformer.wpe.weight.data.shape)
modelA.transformer.wpe.weight.data[:] = modelA.transformer.wpe.weight.data @ R

    # for name in ['c_attn', 'c_proj']:
    #     W = getattr(block.attn, name).weight.data
    #     W[:] = R.T @ W @ R
    # w1 = block.mlp.c_fc.weight.data
    # w2 = block.mlp.c_proj.weight.data
    # w1[:] = R.T @ W1 @ R
    # w2[:] = R.T @ W2 @ R
    


Delta: 6.706436e-05
torch.Size([1024, 768])


In [None]:
# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight

In [None]:
# modelA.lm_head.out_features == tokenizerA.vocab_size

True

Text:

With it:


In [None]:
# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.


Text:

In [66]:
# use model
pipe = pipeline("text-generation", model=modelA, tokenizer=tokenizerB)
print(pipe("Hello, how are you?"))

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, how are you?orm Coulormormorm Coulorm Coul Coulorm Coul Coulinion Coulorm Coulonomousonomous Coulonomousonomousonomousonomousonomousormonomous Coulorm Coulonomousonomousonomous Coulonomousonomous Coulonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Amenonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomous Coulonomousonomousonomousonomousonomousonomousonomoushered…] Coulonomousonomousonomous Amenomniaifulonomousonomouskeleyifulonomous Amenomniaifulhered Amenkeleyomniastad Coulonomousifulifulomniaifulomnia

In [26]:
modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()

True

In [27]:
modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()

True

In [10]:
tok = tokenizerA("This is a test and i wonder why the tokenizers are the same", return_tensors = "pt")
print(tok.input_ids[0])

tensor([ 1212,   318,   257,  1332,   290,  1312,  4240,  1521,   262, 11241,
        11341,   389,   262,   976])


In [11]:
tok = tokenizerB("This is a test and i wonder why the tokenizers are the same", return_tensors = "pt")
print(tok.input_ids[0])

tensor([ 1212,   318,   257,  1332,   290,  1312,  4240,  1521,   262, 11241,
        11341,   389,   262,   976])
