In [1]:
# %pip install gputil
# %pip install setuptools
# %pip install transformers
# %pip install torch

# %pip install auto-gptq #==0.4.0

In [12]:
import GPUtil

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
# from auto_gptq import AutoGPTQForCausalLM

In [7]:
gpus = GPUtil.getGPUs()
if not gpus:
    print("No GPU detected on this system.")
else:
    for gpu in gpus:
        print(f"GPU Name: {gpu.name}")
        print(f"Total VRAM: {gpu.memoryTotal} MB")
        print(f"Free VRAM: {gpu.memoryFree} MB")
        print(f"Used VRAM: {gpu.memoryUsed} MB")
        print("-" * 40)

No GPU detected on this system.


In [3]:
def grab_model(model_name, quantized = False):
    if quantized:
        model = AutoGPTQForCausalLM.from_quantized(model_name, device="cpu", use_safetensors=True)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [110]:
modelA, tokenizerA = grab_model("gpt2")
modelB, tokenizerB = grab_model("EleutherAI/gpt-neo-125M")

# modelA, tokenizerA = grab_model("EleutherAI/gpt-neo-125M-4bit", quantized=True)
# modelB, tokenizerB = grab_model("iproskurina/opt-125m-GPTQ-4bit-g128", quantized=True)

In [5]:
modelA.config.hidden_size == modelB.config.hidden_size 

True

In [60]:
# replace tokenizer:
# modelA.Tokenizer = tokenizerB   # optional when not accessing directly 

# replace token embeddings for input and output:
# modelA.set_input_embeddings(modelB.get_input_embeddings())
# modelA.lm_head.weight = modelB.get_input_embeddings().weight
# modelA.resize_token_embeddings(tokenizerB.vocab_size)

# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight



In [None]:
# emb1 = modelA.get_input_embeddings().weight
# emb2 = modelB.get_input_embeddings().weight

# print("ModelA mean norms:", torch.norm(emb1, dim=1).mean().item())
# print("ModelB mean norms:", torch.norm(emb2, dim=1).mean().item())

# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()

# print(scaling_factor)

# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)

# print("new_embedding mean norms:", torch.norm(new_embedding.weight, dim=1).mean().item())

# modelA.set_input_embeddings(new_embedding)
# modelA.lm_head.weight = new_embedding.weight


In [113]:
print(torch.isnan(modelB.get_input_embeddings().weight).any())
print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())
print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())

tensor(False)
tensor(22.2842, grad_fn=<MaxBackward1>)
tensor(11.5013, grad_fn=<MeanBackward0>)


In [68]:
def check_orthogonal(R):
    I = torch.eye(R.size(0), device=R.device)
    delta = torch.norm(R.T @ R - I)
    print(f"Delta: {delta:.6e}")
    

In [111]:
# use proscrustes:
def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    # A_centered = A - A.mean(dim=0, keepdim=True)
    # B_centered = B - B.mean(dim=0, keepdim=True)

    #M = B_centered.T @ A_centered
    M = B.T @ A
    # find optimal rotation with svd
    U, _, Vt = torch.linalg.svd(M)

    # get rotation matrix that aligns B to A
    R = U @ Vt

    check_orthogonal(R)
    
    return B @ R # return rotated tensor

def get_rotated_matrix(A, B, n = 1000):
    # use only the first n tokens for rotation:
    # return procrustes(A[:n], B[:n])
    return procrustes(A, B)
    




In [None]:
emb1 = modelA.get_input_embeddings().weight
emb2 = modelB.get_input_embeddings().weight

emb2_R = get_rotated_matrix(emb1, emb2)

print("ModelA mean norms:", torch.norm(emb1, dim=1).mean().item())
print("ModelB mean norms:", torch.norm(emb2, dim=1).mean().item())
print("Rotated modelB mean norms:", torch.norm(emb2_R, dim=1).mean().item())

scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2_R, dim=1).mean().item()

print(scaling_factor)

new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)
# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R)


print("new_embedding mean norms:", torch.norm(new_embedding.weight, dim=1).mean().item())

modelA.set_input_embeddings(new_embedding)
modelA.lm_head.weight = new_embedding.weight


Delta: 6.659338e-05
ModelA mean norms: 3.9585366249084473
ModelB mean norms: 11.50130844116211
Rotated modelB mean norms: 11.501314163208008
0.3441812447460622
new_embedding mean norms: 11.501314163208008


In [107]:
modelA.transformer.wpe.weight = modelB.transformer.wpe.weight

In [109]:
modelA.lm_head.out_features == tokenizerA.vocab_size

True

Text:

With it:


In [None]:
# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.


Text:

In [114]:
# use model
pipe = pipeline("text-generation", model=modelA, tokenizer=tokenizerB)
print(pipe("Hello, how are you?"))

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, how are you? the he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he'}]


In [26]:
modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()

True

In [27]:
modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()

True

In [10]:
tok = tokenizerA("This is a test and i wonder why the tokenizers are the same", return_tensors = "pt")
print(tok.input_ids[0])

tensor([ 1212,   318,   257,  1332,   290,  1312,  4240,  1521,   262, 11241,
        11341,   389,   262,   976])


In [11]:
tok = tokenizerB("This is a test and i wonder why the tokenizers are the same", return_tensors = "pt")
print(tok.input_ids[0])

tensor([ 1212,   318,   257,  1332,   290,  1312,  4240,  1521,   262, 11241,
        11341,   389,   262,   976])
