added some comments and cleanup (still not great)

This commit is contained in:
Mick Walter 2025-06-19 17:40:23 +02:00
parent 6a0e3df7f2
commit 584776b39a
7 changed files with 125 additions and 1036 deletions

View File

@ -1,3 +1,19 @@
# Frankenstein
embedding hackathon repo
Embedding hackathon repo
Notebook embedding.ipynb:
Try to swap components between two LLM models. Recommended starting point; has more comments and helper functions.
Notebooks rotation.ipynb, scaling.ipynb and reflection.ipynb:
These apply a single operation to the input and output embeddings of one model. Does it break the model, or is it invariant?
Notebook rotation_fixed.ipynb:
This notebook attempts to rotate the entire model, i.e. all weights in all transformer layers, etc.
This is not as easy as it sounds, and it is highly model specific: different models have very different internal layers and representations. Layers may have different shapes or be concatenated (such as the QKV matrices).
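A minimal sketch of the core swap (hedged: assumes two GPT-2-style Hugging Face models with matching hidden size and tied embeddings; the model names below are just examples):

```python
from transformers import AutoModelForCausalLM

# any two GPT-2-style models with the same hidden size should work here
modelA = AutoModelForCausalLM.from_pretrained("gpt2")
modelB = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

assert modelA.config.hidden_size == modelB.config.hidden_size

# give modelA the token embeddings of modelB, and retie the output head to them
modelA.set_input_embeddings(modelB.get_input_embeddings())
modelA.lm_head.weight = modelB.get_input_embeddings().weight
```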

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "e28fb85c",
"metadata": {},
"outputs": [],
@ -15,6 +15,17 @@
"# %pip install auto-gptq #==0.4.0"
]
},
{
"cell_type": "markdown",
"id": "01f3cd1d",
"metadata": {},
"source": [
"This notebook attempts to open up 2 LMM models, and swap out the input and output embeddings and tokenizer.\n",
"It then runs the model to see if the output still makes sense (hint: actually I dont think you need a hint)\n",
"\n",
"Be mindful of the difference between weights[:]= and weights= (I probably messed that up pretty often)"
]
},
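{
"cell_type": "markdown",
"id": "0a1b2c3e",
"metadata": {},
"source": [
"Added illustration (hedged, not part of the original run): the difference between `weights[:]=` (in-place copy, keeps ties intact) and `weights=` (rebinds the name and silently breaks the tie)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a1b2c3f",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"w = torch.zeros(3)\n",
"tied = w                   # shares storage, like tied input/output embeddings\n",
"w[:] = torch.ones(3)       # in-place copy: `tied` sees the change\n",
"print(tied)                # tensor([1., 1., 1.])\n",
"w = torch.full((3,), 2.0)  # rebinding: `tied` is unaffected\n",
"print(tied)                # still tensor([1., 1., 1.])"
]
},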
{
"cell_type": "code",
"execution_count": 12,
@ -89,7 +100,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "1da291ed",
"metadata": {},
"outputs": [
@ -105,20 +116,27 @@
}
],
"source": [
"# Check if the dimensionality is identical\n",
"modelA.config.hidden_size == modelB.config.hidden_size "
]
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": null,
"id": "dcfc2d85",
"metadata": {},
"outputs": [],
"source": [
"# replace tokenizer:\n",
"# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n",
"## ATTEMPT 1: \n",
"## replace input and output embeddings (tied), \n",
"## optionally resize token embeddings (probably not needed, definitely not needed if tokenizer is the same)\n",
"## optionally also swap the positional encoding (wpe) \n",
"\n",
"# replace token embeddings for input and output:\n",
"\n",
"### replace tokenizer:\n",
"## modelA.Tokenizer = tokenizerB # not necessary: the pipeline will not access this directly anyway\n",
"\n",
"## replace token embeddings for input and output:\n",
"# modelA.set_input_embeddings(modelB.get_input_embeddings())\n",
"# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n",
"# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n",
@ -134,6 +152,9 @@
"metadata": {},
"outputs": [],
"source": [
"## ATTEMPT 2:\n",
"## add a scaling factor, attempting to normalize weight magnitude to match the other model better\n",
"\n",
"# emb1 = modelA.get_input_embeddings().weight\n",
"# emb2 = modelB.get_input_embeddings().weight\n",
"\n",
@ -154,7 +175,7 @@
},
{
"cell_type": "code",
"execution_count": 113,
"execution_count": null,
"id": "c62b2f41",
"metadata": {},
"outputs": [
@ -169,6 +190,8 @@
}
],
"source": [
"# check the max and mean values (should not be too big) and nan values (should not occur)\n",
"\n",
"print(torch.isnan(modelB.get_input_embeddings().weight).any())\n",
"print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n",
"print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())"
@ -176,11 +199,12 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": null,
"id": "2b9893a3",
"metadata": {},
"outputs": [],
"source": [
"# optional check if the resulting rotational matrix is an orthogonal rotation matrix (it should be, within e-5 ish)\n",
"def check_orthogonal(R):\n",
" I = torch.eye(R.size(0), device=R.device)\n",
" delta = torch.norm(R.T @ R - I)\n",
@ -190,18 +214,22 @@
},
{
"cell_type": "code",
"execution_count": 111,
"execution_count": null,
"id": "e1a54c24",
"metadata": {},
"outputs": [],
"source": [
"# use proscrustes:\n",
"def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n",
" # center the model - recommended not to do this.. \n",
" # rotational matrix may improve but the shift kills everything\n",
" # if you do this, probably should shift it back afterwards again in the rotated frame.\n",
" # A_centered = A - A.mean(dim=0, keepdim=True)\n",
" # B_centered = B - B.mean(dim=0, keepdim=True)\n",
"\n",
" #M = B_centered.T @ A_centered\n",
" M = B.T @ A\n",
"\n",
" # find optimal rotation with svd\n",
" U, _, Vt = torch.linalg.svd(M)\n",
"\n",
@ -210,11 +238,12 @@
"\n",
" check_orthogonal(R)\n",
" \n",
" return B @ R # return rotated tensor\n",
" return B @ R # return rotated tensor B\n",
"\n",
"def get_rotated_matrix(A, B, n = 1000):\n",
" # use only the first n tokens for rotation:\n",
" # return procrustes(A[:n], B[:n])\n",
" # or use all, if the model is small enough (it's usually fine, and a badly rotated matrix causes problems):\n",
" return procrustes(A, B)\n",
" \n",
"\n",
@ -241,6 +270,9 @@
}
],
"source": [
"# THIRD ATTEMPT:\n",
"# try aligning with Procrustes first before swapping. Still also uses rescaling.\n",
"\n",
"emb1 = modelA.get_input_embeddings().weight\n",
"emb2 = modelB.get_input_embeddings().weight\n",
"\n",
@ -266,17 +298,18 @@
},
{
"cell_type": "code",
"execution_count": 107,
"execution_count": null,
"id": "9b671b41",
"metadata": {},
"outputs": [],
"source": [
"# optionally swap the position encoding weights.\n",
"modelA.transformer.wpe.weight = modelB.transformer.wpe.weight"
]
},
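{
"cell_type": "code",
"execution_count": null,
"id": "1b2c3d4e",
"metadata": {},
"outputs": [],
"source": [
"# added sanity check (hedged; assumes GPT-2-style models exposing transformer.wpe):\n",
"# the positional embedding tables must share the same (max_positions, hidden) shape for the swap above\n",
"print(modelA.transformer.wpe.weight.shape)\n",
"print(modelB.transformer.wpe.weight.shape)"
]
},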
{
"cell_type": "code",
"execution_count": 109,
"execution_count": null,
"id": "85957357",
"metadata": {},
"outputs": [
@ -292,6 +325,7 @@
}
],
"source": [
"# check if the output shape matches the tokenizer vocab size (it should)\n",
"modelA.lm_head.out_features == tokenizerA.vocab_size"
]
},
@ -318,7 +352,10 @@
"metadata": {},
"outputs": [],
"source": [
"# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n"
"# make extra sure the input and output weights are tied (if you don't trust that they are in your model)\n",
"# usually this should change nothing: they are often even the same object if they are tied.\n",
"# plus we already set this explicitly before, even if it was probably not needed.\n",
"# modelA.lm_head.weight = modelA.get_input_embeddings().weight \n"
]
},
{
@ -375,7 +412,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": null,
"id": "79616f5c",
"metadata": {},
"outputs": [
@ -391,12 +428,13 @@
}
],
"source": [
"# extra check that input and output weights match\n",
"modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"id": "7fc76499",
"metadata": {},
"outputs": [
@ -412,12 +450,13 @@
}
],
"source": [
"# extra check that input and output weights match\n",
"modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "7d51d201",
"metadata": {},
"outputs": [
@ -431,6 +470,8 @@
}
],
"source": [
"# print the token IDs for the given string -> turns out these gpt2 and neo use the same tokenizer\n",
"# thats why swapping them in the pipeline had no effect at all (by itself should break model unless identical) \n",
"tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]

View File

@ -15,6 +15,14 @@
"# %pip install auto-gptq #==0.4.0"
]
},
{
"cell_type": "markdown",
"id": "f16a013c",
"metadata": {},
"source": [
"What happens when you reflect or invert the input and output embeddings?"
]
},
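{
"cell_type": "code",
"execution_count": null,
"id": "2c3d4e5f",
"metadata": {},
"outputs": [],
"source": [
"# added illustration (hedged; `emb` is a placeholder for an embedding weight matrix like the ones below):\n",
"# inversion: negate every embedding vector\n",
"# emb_inv = emb * -1\n",
"# reflection: flip the sign of one coordinate (a mirror along a single axis)\n",
"# emb_ref = emb.clone(); emb_ref[:, 0] = -emb_ref[:, 0]"
]
},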
{
"cell_type": "code",
"execution_count": 1,
@ -38,33 +46,6 @@
"# from auto_gptq import AutoGPTQForCausalLM"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0273f299",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No GPU detected on this system.\n"
]
}
],
"source": [
"gpus = GPUtil.getGPUs()\n",
"if not gpus:\n",
" print(\"No GPU detected on this system.\")\n",
"else:\n",
" for gpu in gpus:\n",
" print(f\"GPU Name: {gpu.name}\")\n",
" print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n",
" print(f\"Free VRAM: {gpu.memoryFree} MB\")\n",
" print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n",
" print(\"-\" * 40)"
]
},
{
"cell_type": "code",
"execution_count": 2,
@ -117,119 +98,6 @@
"modelA.config.hidden_size == modelB.config.hidden_size "
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "dcfc2d85",
"metadata": {},
"outputs": [],
"source": [
"# replace tokenizer:\n",
"# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n",
"\n",
"# replace token embeddings for input and output:\n",
"# modelA.set_input_embeddings(modelB.get_input_embeddings())\n",
"# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n",
"# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n",
"\n",
"# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1011d3ad",
"metadata": {},
"outputs": [],
"source": [
"# emb1 = modelA.get_input_embeddings().weight\n",
"# emb2 = modelB.get_input_embeddings().weight\n",
"\n",
"# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
"# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
"\n",
"# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n",
"\n",
"# print(scaling_factor)\n",
"\n",
"# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n",
"\n",
"# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
"\n",
"# modelA.set_input_embeddings(new_embedding)\n",
"# modelA.lm_head.weight = new_embedding.weight\n"
]
},
{
"cell_type": "code",
"execution_count": 113,
"id": "c62b2f41",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(False)\n",
"tensor(22.2842, grad_fn=<MaxBackward1>)\n",
"tensor(11.5013, grad_fn=<MeanBackward0>)\n"
]
}
],
"source": [
"print(torch.isnan(modelB.get_input_embeddings().weight).any())\n",
"print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n",
"print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2b9893a3",
"metadata": {},
"outputs": [],
"source": [
"def check_orthogonal(R):\n",
" I = torch.eye(R.size(0), device=R.device)\n",
" delta = torch.norm(R.T @ R - I)\n",
" print(f\"Delta: {delta:.6e}\")\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e1a54c24",
"metadata": {},
"outputs": [],
"source": [
"# use proscrustes:\n",
"def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n",
" # A_centered = A - A.mean(dim=0, keepdim=True)\n",
" # B_centered = B - B.mean(dim=0, keepdim=True)\n",
"\n",
" #M = B_centered.T @ A_centered\n",
" M = B.T @ A\n",
" # find optimal rotation with svd\n",
" U, _, Vt = torch.linalg.svd(M)\n",
"\n",
" # get rotation matrix that aligns B to A\n",
" R = U @ Vt\n",
"\n",
" check_orthogonal(R)\n",
" \n",
" return B @ R # return rotated tensor\n",
"\n",
"def get_rotated_matrix(A, B, n = 1000):\n",
" # use only the first n tokens for rotation:\n",
" # return procrustes(A[:n], B[:n])\n",
" return procrustes(A, B)\n",
" \n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
@ -243,7 +111,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": null,
"id": "ff93495e",
"metadata": {},
"outputs": [
@ -259,41 +127,20 @@
],
"source": [
"\n",
"\n",
"# emb1_R = get_rotated_matrix(emb2, emb1)\n",
"\n",
"#emb1_R = emb1 * -1\n",
"# emb1_R = emb1.T\n",
"# flip the matrix\n",
"emb1_R = emb1.flip(dims=[0]) # .reverse() #[::-1]\n",
"\n",
"print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
"# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
"print(\"Rotated modelB mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n",
"print(\"Rotated modelA mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n",
"\n",
"# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2_R, dim=1).mean().item()\n",
"\n",
"# print(scaling_factor)\n",
"\n",
"# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n",
"new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n",
"\n",
"\n",
"print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
"\n",
"modelA.set_input_embeddings(new_embedding)\n",
"modelA.lm_head.weight = new_embedding.weight\n"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "9b671b41",
"metadata": {},
"outputs": [],
"source": [
"modelA.transformer.wpe.weight = modelB.transformer.wpe.weight"
]
},
{
"cell_type": "code",
"execution_count": 109,
@ -315,40 +162,6 @@
"modelA.lm_head.out_features == tokenizerA.vocab_size"
]
},
{
"cell_type": "markdown",
"id": "f6b39638",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "markdown",
"id": "fbfa8d62",
"metadata": {},
"source": [
"With it:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "998a0ed6",
"metadata": {},
"outputs": [],
"source": [
"# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n"
]
},
{
"cell_type": "markdown",
"id": "aa8b7ca4",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "code",
"execution_count": 29,
@ -384,120 +197,6 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7673a5e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 26,
"id": "79616f5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "7fc76499",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7d51d201",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2e76534a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a44c465a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "381c712f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "153995fe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

requirements.txt Normal file
View File

@ -0,0 +1,4 @@
gputil==1.4.0
setuptools==80.9.0
transformers==4.52.4
torch==2.7.1

View File

@ -15,6 +15,14 @@
"# %pip install auto-gptq #==0.4.0"
]
},
{
"cell_type": "markdown",
"id": "9c5573f3",
"metadata": {},
"source": [
"What happens if you rotate the input and output vectors?"
]
},
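{
"cell_type": "code",
"execution_count": null,
"id": "3d4e5f6a",
"metadata": {},
"outputs": [],
"source": [
"# added illustration (hedged; `emb` and `hidden` are placeholder names):\n",
"# rotate the embedding table with a random orthogonal matrix Q\n",
"# Q, _ = torch.linalg.qr(torch.randn(hidden, hidden))  # Q is orthogonal\n",
"# emb_rot = emb @ Q"
]
},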
{
"cell_type": "code",
"execution_count": 1,
@ -38,33 +46,6 @@
"# from auto_gptq import AutoGPTQForCausalLM"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0273f299",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No GPU detected on this system.\n"
]
}
],
"source": [
"gpus = GPUtil.getGPUs()\n",
"if not gpus:\n",
" print(\"No GPU detected on this system.\")\n",
"else:\n",
" for gpu in gpus:\n",
" print(f\"GPU Name: {gpu.name}\")\n",
" print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n",
" print(f\"Free VRAM: {gpu.memoryFree} MB\")\n",
" print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n",
" print(\"-\" * 40)"
]
},
{
"cell_type": "code",
"execution_count": 2,
@ -117,50 +98,6 @@
"modelA.config.hidden_size == modelB.config.hidden_size "
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "dcfc2d85",
"metadata": {},
"outputs": [],
"source": [
"# replace tokenizer:\n",
"# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n",
"\n",
"# replace token embeddings for input and output:\n",
"# modelA.set_input_embeddings(modelB.get_input_embeddings())\n",
"# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n",
"# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n",
"\n",
"# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1011d3ad",
"metadata": {},
"outputs": [],
"source": [
"# emb1 = modelA.get_input_embeddings().weight\n",
"# emb2 = modelB.get_input_embeddings().weight\n",
"\n",
"# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
"# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
"\n",
"# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n",
"\n",
"# print(scaling_factor)\n",
"\n",
"# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n",
"\n",
"# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
"\n",
"# modelA.set_input_embeddings(new_embedding)\n",
"# modelA.lm_head.weight = new_embedding.weight\n"
]
},
{
"cell_type": "code",
"execution_count": 113,
@ -232,7 +169,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "ff93495e",
"metadata": {},
"outputs": [
@ -256,32 +193,16 @@
"\n",
"print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
"print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
"print(\"Rotated modelB mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n",
"print(\"Rotated modelA mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n",
"\n",
"# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2_R, dim=1).mean().item()\n",
"\n",
"# print(scaling_factor)\n",
"\n",
"# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n",
"new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n",
"\n",
"\n",
"print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
"\n",
"modelA.set_input_embeddings(new_embedding)\n",
"modelA.lm_head.weight = new_embedding.weight\n"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "9b671b41",
"metadata": {},
"outputs": [],
"source": [
"modelA.transformer.wpe.weight = modelB.transformer.wpe.weight"
]
},
{
"cell_type": "code",
"execution_count": 109,
@ -303,40 +224,6 @@
"modelA.lm_head.out_features == tokenizerA.vocab_size"
]
},
{
"cell_type": "markdown",
"id": "f6b39638",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "markdown",
"id": "fbfa8d62",
"metadata": {},
"source": [
"With it:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "998a0ed6",
"metadata": {},
"outputs": [],
"source": [
"# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n"
]
},
{
"cell_type": "markdown",
"id": "aa8b7ca4",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "code",
"execution_count": 10,
@ -372,120 +259,6 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7673a5e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 26,
"id": "79616f5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "7fc76499",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7d51d201",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2e76534a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a44c465a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "381c712f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "153995fe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@ -15,6 +15,20 @@
"# %pip install auto-gptq #==0.4.0"
]
},
{
"cell_type": "markdown",
"id": "10e0a35a",
"metadata": {},
"source": [
"What happens if you try to rotate an entire LMM model. Will it still work if you consistently rotate all trained matrices?\n",
"\n",
"Doing this is very specific to the internal representations of a particular LMM. Different models have very different internal layers and representations. Layers may have different shapes, or are concatenated (such as the kvq matrices). \n",
"\n",
"Should all matrices be rotated, and which should be conjugated? \n",
"\n",
"This notebook just offers some base code, it's still far removed from the right approach."
]
},
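{
"cell_type": "code",
"execution_count": null,
"id": "4e5f6a7b",
"metadata": {},
"outputs": [],
"source": [
"# added conceptual sketch (hedged; R is an orthogonal rotation of the hidden space,\n",
"# and W, b, E are placeholder names, not this notebook's variables):\n",
"# if hidden states transform as h' = h @ R, consistency requires rotating everything\n",
"# that lives in the hidden space and conjugating hidden-to-hidden maps:\n",
"#   W' = R.T @ W @ R   # hidden -> hidden weight\n",
"#   b' = b @ R         # bias in hidden space\n",
"#   E' = E @ R         # token / positional embeddings (and the tied lm_head)\n",
"# caveat: LayerNorm works per coordinate (mean over the hidden dim), so it is not\n",
"# rotation invariant in general - one reason this is harder than it sounds."
]
},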
{
"cell_type": "code",
"execution_count": 1,
@ -117,50 +131,6 @@
"modelA.config.hidden_size == modelB.config.hidden_size "
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "dcfc2d85",
"metadata": {},
"outputs": [],
"source": [
"# replace tokenizer:\n",
"# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n",
"\n",
"# replace token embeddings for input and output:\n",
"# modelA.set_input_embeddings(modelB.get_input_embeddings())\n",
"# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n",
"# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n",
"\n",
"# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1011d3ad",
"metadata": {},
"outputs": [],
"source": [
"# emb1 = modelA.get_input_embeddings().weight\n",
"# emb2 = modelB.get_input_embeddings().weight\n",
"\n",
"# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
"# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
"\n",
"# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n",
"\n",
"# print(scaling_factor)\n",
"\n",
"# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n",
"\n",
"# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
"\n",
"# modelA.set_input_embeddings(new_embedding)\n",
"# modelA.lm_head.weight = new_embedding.weight\n"
]
},
{
"cell_type": "code",
"execution_count": 113,
@ -199,7 +169,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "e1a54c24",
"metadata": {},
"outputs": [],
@ -219,10 +189,7 @@
"\n",
" check_orthogonal(R)\n",
" \n",
" return R # return rotated tensor\n",
"\n",
"\n",
"\n"
" return R # return rotated tensor\n"
]
},
{
@ -320,71 +287,6 @@
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b671b41",
"metadata": {},
"outputs": [],
"source": [
"# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85957357",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# modelA.lm_head.out_features == tokenizerA.vocab_size"
]
},
{
"cell_type": "markdown",
"id": "f6b39638",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "markdown",
"id": "fbfa8d62",
"metadata": {},
"source": [
"With it:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "998a0ed6",
"metadata": {},
"outputs": [],
"source": [
"# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n"
]
},
{
"cell_type": "markdown",
"id": "aa8b7ca4",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "code",
"execution_count": 66,
@ -420,120 +322,6 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7673a5e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 26,
"id": "79616f5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "7fc76499",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7d51d201",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2e76534a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a44c465a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "381c712f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "153995fe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@ -15,6 +15,14 @@
"# %pip install auto-gptq #==0.4.0"
]
},
{
"cell_type": "markdown",
"id": "8922a3da",
"metadata": {},
"source": [
"What happens when you rescale the input and output embeddings?"
]
},
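{
"cell_type": "code",
"execution_count": null,
"id": "5f6a7b8c",
"metadata": {},
"outputs": [],
"source": [
"# added illustration (hedged; mirrors the scaling cell below with placeholder names):\n",
"# s = torch.norm(embA, dim=1).mean() / torch.norm(embB, dim=1).mean()  # ratio of mean row norms\n",
"# emb_scaled = embB * s  # rescale B to match A (or divide A by s, as done below)"
]
},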
{
"cell_type": "code",
"execution_count": 1,
@ -117,50 +125,6 @@
"modelA.config.hidden_size == modelB.config.hidden_size "
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "dcfc2d85",
"metadata": {},
"outputs": [],
"source": [
"# replace tokenizer:\n",
"# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n",
"\n",
"# replace token embeddings for input and output:\n",
"# modelA.set_input_embeddings(modelB.get_input_embeddings())\n",
"# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n",
"# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n",
"\n",
"# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1011d3ad",
"metadata": {},
"outputs": [],
"source": [
"# emb1 = modelA.get_input_embeddings().weight\n",
"# emb2 = modelB.get_input_embeddings().weight\n",
"\n",
"# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
"# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
"\n",
"# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n",
"\n",
"# print(scaling_factor)\n",
"\n",
"# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n",
"\n",
"# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
"\n",
"# modelA.set_input_embeddings(new_embedding)\n",
"# modelA.lm_head.weight = new_embedding.weight\n"
]
},
{
"cell_type": "code",
"execution_count": 113,
@ -199,40 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e1a54c24",
"metadata": {},
"outputs": [],
"source": [
"# use proscrustes:\n",
"def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n",
" # A_centered = A - A.mean(dim=0, keepdim=True)\n",
" # B_centered = B - B.mean(dim=0, keepdim=True)\n",
"\n",
" #M = B_centered.T @ A_centered\n",
" M = B.T @ A\n",
" # find optimal rotation with svd\n",
" U, _, Vt = torch.linalg.svd(M)\n",
"\n",
" # get rotation matrix that aligns B to A\n",
" R = U @ Vt\n",
"\n",
" check_orthogonal(R)\n",
" \n",
" return B @ R # return rotated tensor\n",
"\n",
"def get_rotated_matrix(A, B, n = 1000):\n",
" # use only the first n tokens for rotation:\n",
" # return procrustes(A[:n], B[:n])\n",
" return procrustes(A, B)\n",
" \n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "ff93495e",
"metadata": {},
"outputs": [
@ -251,36 +182,21 @@
"emb1 = modelA.get_input_embeddings().weight\n",
"emb2 = modelB.get_input_embeddings().weight\n",
"\n",
"# emb1_R = get_rotated_matrix(emb2, emb1)\n",
"\n",
"print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
"print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
"# print(\"Rotated modelB mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n",
"\n",
"scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n",
"\n",
"print(scaling_factor)\n",
"print(\"Scaling factor: \", scaling_factor)\n",
"\n",
"# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n",
"new_embedding = torch.nn.Embedding.from_pretrained(emb1/scaling_factor)\n",
"\n",
"\n",
"print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
"\n",
"modelA.set_input_embeddings(new_embedding)\n",
"modelA.lm_head.weight = new_embedding.weight\n"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "9b671b41",
"metadata": {},
"outputs": [],
"source": [
"modelA.transformer.wpe.weight = modelB.transformer.wpe.weight"
]
},
{
"cell_type": "code",
"execution_count": 109,
@ -302,40 +218,6 @@
"modelA.lm_head.out_features == tokenizerA.vocab_size"
]
},
{
"cell_type": "markdown",
"id": "f6b39638",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "markdown",
"id": "fbfa8d62",
"metadata": {},
"source": [
"With it:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "998a0ed6",
"metadata": {},
"outputs": [],
"source": [
"# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n"
]
},
{
"cell_type": "markdown",
"id": "aa8b7ca4",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "code",
"execution_count": 12,
@ -371,120 +253,6 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7673a5e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 26,
"id": "79616f5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "7fc76499",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7d51d201",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2e76534a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a44c465a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "381c712f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "153995fe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {