In [1]:
# %pip install gputil
# %pip install setuptools
# %pip install transformers
# %pip install torch

# %pip install auto-gptq #==0.4.0

What happens when you rescale the input and output embeddings?

In [1]:
import GPUtil

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
# from auto_gptq import AutoGPTQForCausalLM

 from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Report every GPU that GPUtil can see, or say so when there is none.
gpus = GPUtil.getGPUs()
if gpus:
    for gpu in gpus:
        # One print call per device; the "\n" separator reproduces the
        # line-by-line layout, ending with a 40-dash divider.
        print(
            f"GPU Name: {gpu.name}",
            f"Total VRAM: {gpu.memoryTotal} MB",
            f"Free VRAM: {gpu.memoryFree} MB",
            f"Used VRAM: {gpu.memoryUsed} MB",
            "-" * 40,
            sep="\n",
        )
else:
    print("No GPU detected on this system.")

No GPU detected on this system.


In [2]:
def grab_model(model_name, quantized = False):
    """Load a causal language model together with its tokenizer.

    Args:
        model_name: Model identifier or path, passed unchanged to the
            underlying ``from_pretrained`` / ``from_quantized`` loaders.
        quantized: When true, load the model through
            ``AutoGPTQForCausalLM.from_quantized`` on CPU using safetensors
            weights; otherwise load it through
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ImportError: If ``quantized`` is true but ``auto_gptq`` cannot be
            imported.
    """
    if quantized:
        # auto_gptq is an optional dependency (its notebook-level import is
        # commented out), so import it lazily: the default path keeps working
        # without it and the quantized path fails with an actionable message
        # instead of a NameError.
        try:
            from auto_gptq import AutoGPTQForCausalLM
        except ImportError as exc:
            raise ImportError(
                "quantized=True requires the optional 'auto-gptq' package "
                "(install it with `%pip install auto-gptq`)."
            ) from exc
        model = AutoGPTQForCausalLM.from_quantized(model_name, device="cpu", use_safetensors=True)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [3]:
# Load the two full-precision checkpoints, in order: model A first, then
# model B. Each grab_model call yields a (model, tokenizer) pair.
(modelA, tokenizerA), (modelB, tokenizerB) = (
    grab_model(checkpoint)
    for checkpoint in ("gpt2", "EleutherAI/gpt-neo-125M")
)

# Quantized alternatives, kept for reference (they need the optional GPTQ loader):
# modelA, tokenizerA = grab_model("EleutherAI/gpt-neo-125M-4bit", quantized=True)
# modelB, tokenizerB = grab_model("iproskurina/opt-125m-GPTQ-4bit-g128", quantized=True)

In [5]:
# Compare the configured hidden sizes of the two models; the boolean result is
# displayed as the cell output.
# NOTE(review): presumably a precondition for comparing the two embedding
# matrices row by row -- confirm.
modelA.config.hidden_size == modelB.config.hidden_size

True

In [113]:
# Sanity-check modelB's input embedding matrix: any NaNs, then the largest and
# the average per-row L2 norm.
embB_weight = modelB.get_input_embeddings().weight
embB_row_norms = torch.norm(embB_weight, dim=1)

print(torch.isnan(embB_weight).any())
print(embB_row_norms.max())
print(embB_row_norms.mean())

tensor(False)
tensor(22.2842, grad_fn=)
tensor(11.5013, grad_fn=)


In [4]:
def check_orthogonal(R):
    """Print how far ``R`` is from having orthonormal columns.

    Computes the norm of ``R.T @ R - I`` (``torch.norm`` with default
    arguments) and prints it in scientific notation as ``Delta: ...``.
    A value near zero means ``R.T @ R`` is close to the identity.

    Args:
        R: 2-D floating-point tensor. It may be rectangular; the identity is
            sized to match ``R.T @ R``, i.e. (columns x columns).

    Returns:
        None. The result is only printed.
    """
    # R.T @ R has shape (cols, cols), so the identity must be built from the
    # column count; using the row count breaks on non-square input.
    n_cols = R.size(1)
    # Match R's dtype and device so the subtraction needs no implicit promotion.
    identity = torch.eye(n_cols, dtype=R.dtype, device=R.device)
    delta = torch.norm(R.T @ R - identity)
    print(f"Delta: {delta.item():.6e}")
 

In [None]:
# Rescale modelA's input embedding matrix so that its mean per-row L2 norm
# matches modelB's, then install the rescaled matrix as modelA's input
# embedding and as the weight of its LM head.
emb1 = modelA.get_input_embeddings().weight
emb2 = modelB.get_input_embeddings().weight

# This is pure weight surgery, so no autograd graph is needed. Each mean norm
# is computed once and reused rather than recomputed for the scaling factor.
with torch.no_grad():
    mean_norm_a = torch.norm(emb1, dim=1).mean().item()
    mean_norm_b = torch.norm(emb2, dim=1).mean().item()

    print("ModelA mean norms:", mean_norm_a)
    print("ModelB mean norms:", mean_norm_b)

    # Dividing emb1 by (norm_a / norm_b) moves its mean row norm to norm_b.
    scaling_factor = mean_norm_a / mean_norm_b

    print("Scaling factor: ", scaling_factor)

    # Embedding.from_pretrained wraps the tensor as the layer's weight; with
    # its default freeze=True the weight does not require gradients.
    new_embedding = torch.nn.Embedding.from_pretrained(emb1 / scaling_factor)

    print("new_embedding mean norms:", torch.norm(new_embedding.weight, dim=1).mean().item())

modelA.set_input_embeddings(new_embedding)
# Point the output projection at the same Parameter so the input and output
# embeddings share the rescaled weights.
modelA.lm_head.weight = new_embedding.weight


ModelA mean norms: 1.3624546527862549
ModelB mean norms: 11.50130844116211
0.1184608394563315
new_embedding mean norms: 11.50130844116211


In [109]:
# Check that modelA's LM head has as many output features as tokenizerA reports
# vocabulary entries; the boolean result is displayed as the cell output.
modelA.lm_head.out_features == tokenizerA.vocab_size

True

In [12]:
# use model
# Build a text-generation pipeline around modelA and print its continuation of
# a fixed prompt.
# NOTE(review): modelA is paired with tokenizerB here, while the vocab-size
# check above uses tokenizerA -- confirm the two tokenizers are interchangeable
# or that this pairing is intended.
pipe = pipeline("text-generation", model=modelA, tokenizer=tokenizerB)
print(pipe("Hello, how are you?"))

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, how are you?\n\nYou are not a new.\n\nYou are a new.\n\n\nYou are not a new.\n\n\nYou are not a new.\n\n\na new.\n\na new.\n\na.\n\na.\n\na.\n\na.\n\na.\n\na.\n\na.\n\na.\n\na\n\na.\n\na\n\n.\na\n\na\n\na\n\n.\na\n\na\n\n.\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\n.\n\na\n\n.\n\na\n\n.\n\na\n\n.\n\n.\n\na\n\n.\n\na\n\n.\n\n.\n\na\n\n.\n\na\n\n.\n\n'}]
