From 584776b39ae911746aa24622fa4a0a99b45116d3 Mon Sep 17 00:00:00 2001 From: Mick Date: Thu, 19 Jun 2025 17:40:23 +0200 Subject: [PATCH] added some comments and cleanup (still not great) --- README.md | 18 ++- embedding.ipynb | 73 +++++++--- reflection.ipynb | 323 ++----------------------------------------- requirements.txt | 4 + rotation.ipynb | 247 ++------------------------------- rotation_fixed.ipynb | 244 +++----------------------------- scaling.ipynb | 252 ++------------------------------- 7 files changed, 125 insertions(+), 1036 deletions(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index b426332..219e408 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,19 @@ # Frankenstein -embedding hackathon repo \ No newline at end of file +Embedding hackathon repo + +Notebook embedding.ipynb: +Tries to swap components between two LLMs. Recommended starting point: +it has the most comments and helper functions. + + +Notebooks rotation.ipynb, scaling.ipynb and reflection.ipynb: +Each attempts a single operation on the input and output embeddings of one model. +Does it break the model, or is the model invariant to the operation? + + +Notebook rotation_fixed.ipynb: +This notebook attempts to rotate the entire model, i.e. all weights in all transformer layers, etc. +This is not as easy as it sounds, and it is highly model-specific: different models have very different internal layers and representations. Layers may have different shapes, or be concatenated (such as the QKV matrices). + + diff --git a/embedding.ipynb b/embedding.ipynb index bbc01ba..40ec2c4 100644 --- a/embedding.ipynb +++ b/embedding.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "e28fb85c", "metadata": {}, "outputs": [], @@ -15,6 +15,17 @@ "# %pip install auto-gptq #==0.4.0" ] }, + { + "cell_type": "markdown", + "id": "01f3cd1d", + "metadata": {}, + "source": [ + "This notebook attempts to open up two LLMs and swap out their input and output embeddings and tokenizers.\n", + "It then runs the model to see if the output still makes sense (hint: you don't actually need a hint).\n", + "\n", + "Be mindful of the difference between `weights[:] =` (copies values into the existing tensor, preserving weight tying) and `weights =` (rebinds the name to a new tensor, breaking the tie). I probably messed that up pretty often." + ] + }, { "cell_type": "code", "execution_count": 12, @@ -89,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "1da291ed", "metadata": {}, "outputs": [ @@ -105,20 +116,27 @@ } ], "source": [ + "# Check that the hidden dimensionality of the two models is identical\n", "modelA.config.hidden_size == modelB.config.hidden_size " ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "dcfc2d85", "metadata": {}, "outputs": [], "source": [ - "# replace tokenizer:\n", - "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", + "## ATTEMPT 1: \n", + "## replace input and output embeddings (tied), \n", + "## optionally resize token embeddings (probably not needed, definitely not needed if the tokenizer is the same)\n", + "## optionally also swap the positional encoding (wpe) \n", + "\n", + "\n", + "### replace tokenizer:\n", + "## modelA.Tokenizer = tokenizerB # not necessary: the pipeline will not access this directly anyway\n", + "\n", + "## replace token embeddings for input and output:\n", "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", "@@ 
-134,6 +152,9 @@ "metadata": {}, "outputs": [], "source": [ + "## ATTEMPT 2:\n", + "## add a scaling factor, attempting to normalize the weight magnitudes to better match the other model\n", + "\n", "# emb1 = modelA.get_input_embeddings().weight\n", "# emb2 = modelB.get_input_embeddings().weight\n", "\n", @@ -154,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": null, "id": "c62b2f41", "metadata": {}, "outputs": [ @@ -169,6 +190,8 @@ } ], "source": [ + "# sanity check: the max and mean embedding norms should not be too large, and NaN values should not occur\n", + "\n", "print(torch.isnan(modelB.get_input_embeddings().weight).any())\n", "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n", "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())" @@ -176,11 +199,12 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": null, "id": "2b9893a3", "metadata": {}, "outputs": [], "source": [ + "# optional check that the resulting matrix is an orthogonal rotation matrix (it should be, to within ~1e-5)\n", "def check_orthogonal(R):\n", " I = torch.eye(R.size(0), device=R.device)\n", " delta = torch.norm(R.T @ R - I)\n", @@ -190,18 +214,22 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": null, "id": "e1a54c24", "metadata": {}, "outputs": [], "source": [ "# use Procrustes:\n", "def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n", + " # centering the matrices is not recommended:\n", + " # the rotation may improve, but the shift breaks everything downstream\n", + " # if you do center, you should probably shift back afterwards in the rotated frame.\n", " # A_centered = A - A.mean(dim=0, keepdim=True)\n", " # B_centered = B - B.mean(dim=0, keepdim=True)\n", "\n", " #M = B_centered.T @ A_centered\n", " M = B.T @ A\n", "\n", " # find optimal rotation with svd\n", " U, _, Vt = torch.linalg.svd(M)\n", "\n", @@ -210,11 +238,12 @@ "\n", " check_orthogonal(R)\n", " \n", - " return B @ R # return rotated tensor\n", + " return B @ R # return rotated tensor B\n", "\n", "def get_rotated_matrix(A, B, n = 1000):\n", " # use only the first n tokens for rotation:\n", " # return procrustes(A[:n], B[:n])\n", + " # or use all tokens, if the model is small enough (usually fine; a badly fitted rotation causes problems):\n", " return procrustes(A, B)\n", " \n", "\n", @@ -241,6 +270,9 @@ } ], "source": [ + "## ATTEMPT 3:\n", + "## try aligning with Procrustes first before swapping; still also uses rescaling.\n",
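 + "## (orthogonal Procrustes finds the orthogonal R minimizing ||A - B @ R||_F, so B is only rotated/reflected, never stretched)\n",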
 + "\n", "emb1 = modelA.get_input_embeddings().weight\n", "emb2 = modelB.get_input_embeddings().weight\n", "\n", @@ -266,17 +298,18 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": null, "id": "9b671b41", "metadata": {}, "outputs": [], "source": [ + "# optionally swap the position encoding weights.\n", "modelA.transformer.wpe.weight = modelB.transformer.wpe.weight" ] }, { "cell_type": "code", - "execution_count": 109, + "execution_count": null, "id": "85957357", "metadata": {}, "outputs": [ @@ -292,6 +325,7 @@ } ], "source": [ + "# check that the output shape matches the tokenizer vocab size (it should)\n", "modelA.lm_head.out_features == tokenizerA.vocab_size" ] }, @@ -318,7 +352,10 @@ "metadata": {}, "outputs": [], "source": [ - "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" + "# make extra sure the input and output weights are tied (if you don't trust that they are in your model)\n", + "# usually this should change nothing: they are often even the same object when tied.\n", + "# plus we already set this explicitly above, even though it was probably not needed.\n", + "# modelA.lm_head.weight = modelA.get_input_embeddings().weight \n" ] }, { @@ -375,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "79616f5c", "metadata": {}, "outputs": [ @@ -391,12 +428,13 @@ } ], "source": [ + "# extra check that the input and output weights are tied (same underlying storage)\n", "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "7fc76499", "metadata": {}, "outputs": [ @@ -412,12 +450,13 @@ } ], "source": [ + "# extra check that the input and output weights are tied (same underlying storage)\n", "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "7d51d201", "metadata": {}, "outputs": [ @@ -431,6 +470,8 @@ } ], "source": [ + "# print the token IDs for the given string -> it turns out GPT-2 and GPT-Neo use the same tokenizer\n", + "# that's why swapping them in the pipeline had no effect at all (by itself it should break the model, unless the tokenizers are identical) \n", "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", "print(tok.input_ids[0])" ] diff --git a/reflection.ipynb b/reflection.ipynb index 18328b2..00de06b 100644 --- a/reflection.ipynb +++ b/reflection.ipynb @@ -15,6 +15,14 @@ "# %pip install auto-gptq #==0.4.0" ] }, + { + "cell_type": "markdown", + "id": "f16a013c", + "metadata": {}, + "source": [ + "What happens when you reflect or invert the input and output embeddings?\n",
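 + "\n", + "A minimal sketch of the two operations tried below (a sign flip reflects every embedding vector through the origin; reversing the rows permutes which token gets which vector):\n", + "\n", + "```python\n", + "E = modelA.get_input_embeddings().weight  # (vocab_size, hidden_size)\n", + "E_reflected = E * -1           # point reflection through the origin\n", + "E_permuted = E.flip(dims=[0])  # token i gets the vector of token vocab_size-1-i\n", + "```"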
+ ] + }, { "cell_type": "code", "execution_count": 1, @@ -38,33 +46,6 @@ "# from auto_gptq import AutoGPTQForCausalLM" ] }, - { - "cell_type": "code", - "execution_count": 7, - "id": "0273f299", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No GPU detected on this system.\n" - ] - } - ], - "source": [ - "gpus = GPUtil.getGPUs()\n", - "if not gpus:\n", - " print(\"No GPU detected on this system.\")\n", - "else:\n", - " for gpu in gpus:\n", - " print(f\"GPU Name: {gpu.name}\")\n", - " print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n", - " print(f\"Free VRAM: {gpu.memoryFree} MB\")\n", - " print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n", - " print(\"-\" * 40)" - ] - }, { "cell_type": "code", "execution_count": 2, @@ -117,119 +98,6 @@ "modelA.config.hidden_size == modelB.config.hidden_size " ] }, - { - "cell_type": "code", - "execution_count": 60, - "id": "dcfc2d85", - "metadata": {}, - "outputs": [], - "source": [ - "# replace tokenizer:\n", - "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", - "\n", - "# replace token embeddings for input and output:\n", - "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", - "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", - "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", - "\n", - "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1011d3ad", - "metadata": {}, - "outputs": [], - "source": [ - "# emb1 = modelA.get_input_embeddings().weight\n", - "# emb2 = modelB.get_input_embeddings().weight\n", - "\n", - "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", - "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", - "\n", - "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", - "\n", - "# print(scaling_factor)\n", - "\n", - "# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n", - "\n", - "# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", - "\n", - "# modelA.set_input_embeddings(new_embedding)\n", - "# modelA.lm_head.weight = new_embedding.weight\n" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "id": "c62b2f41", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor(False)\n", - "tensor(22.2842, grad_fn=)\n", - "tensor(11.5013, grad_fn=)\n" - ] - } - ], - "source": [ - "print(torch.isnan(modelB.get_input_embeddings().weight).any())\n", - "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n", - "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2b9893a3", - "metadata": {}, - "outputs": [], - "source": [ - "def check_orthogonal(R):\n", - " I = torch.eye(R.size(0), device=R.device)\n", - " delta = torch.norm(R.T @ R - I)\n", - " print(f\"Delta: {delta:.6e}\")\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "e1a54c24", - "metadata": {}, - "outputs": [], - "source": [ - "# use proscrustes:\n", - "def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n", - " # A_centered = A - A.mean(dim=0, keepdim=True)\n", - " # B_centered = B - B.mean(dim=0, keepdim=True)\n", - "\n", - " #M = B_centered.T @ A_centered\n", - " M = B.T @ A\n", - " # find optimal rotation with svd\n", - " 
U, _, Vt = torch.linalg.svd(M)\n", - "\n", - " # get rotation matrix that aligns B to A\n", - " R = U @ Vt\n", - "\n", - " check_orthogonal(R)\n", - " \n", - " return B @ R # return rotated tensor\n", - "\n", - "def get_rotated_matrix(A, B, n = 1000):\n", - " # use only the first n tokens for rotation:\n", - " # return procrustes(A[:n], B[:n])\n", - " return procrustes(A, B)\n", - " \n", - "\n", - "\n" - ] - }, { "cell_type": "code", "execution_count": 27, @@ -243,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "ff93495e", "metadata": {}, "outputs": [ @@ -259,41 +127,20 @@ ], "source": [ "\n", - "\n", - "# emb1_R = get_rotated_matrix(emb2, emb1)\n", - "\n", - "#emb1_R = emb1 * -1\n", - "# emb1_R = emb1.T\n", + "# flip the matrix (reverse the row order)\n", "emb1_R = emb1.flip(dims=[0]) # .reverse() #[::-1]\n", "\n", "print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", - "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", - "print(\"Rotated modelB mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n", + "print(\"Flipped modelA mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n", "\n", - "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2_R, dim=1).mean().item()\n", - "\n", - "# print(scaling_factor)\n", - "\n", - "# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n", "new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n", "\n", - "\n", "print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", "\n", "modelA.set_input_embeddings(new_embedding)\n", "modelA.lm_head.weight = new_embedding.weight\n" ] }, - { - "cell_type": "code", - "execution_count": 107, - "id": "9b671b41", - "metadata": {}, - "outputs": [], - "source": [ - "modelA.transformer.wpe.weight = modelB.transformer.wpe.weight" - ] - }, { "cell_type": "code", "execution_count": 109, @@ -315,40 +162,6 @@ "modelA.lm_head.out_features == tokenizerA.vocab_size" ] }, - { - "cell_type": "markdown", - "id": "f6b39638", - "metadata": {}, - "source": [ - "Text:" - ] - }, - { - "cell_type": "markdown", - "id": "fbfa8d62", - "metadata": {}, - "source": [ - "With it:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "998a0ed6", - "metadata": {}, - "outputs": [], - "source": [ - "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" - ] - }, - { - "cell_type": "markdown", - "id": "aa8b7ca4", - "metadata": {}, - "source": [ - "Text:" - ] - }, { "cell_type": "code", "execution_count": 29, @@ -384,120 +197,6 @@ "metadata": {}, "outputs": [], "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7673a5e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "79616f5c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "7fc76499", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" - ] - }, - { - 
"cell_type": "code", - "execution_count": 10, - "id": "7d51d201", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", - " 11341, 389, 262, 976])\n" - ] - } - ], - "source": [ - "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", - "print(tok.input_ids[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2e76534a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", - " 11341, 389, 262, 976])\n" - ] - } - ], - "source": [ - "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", - "print(tok.input_ids[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a44c465a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "381c712f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "153995fe", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..855f2e4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +gputil==1.4.0 +setuptools==80.9.0 +transformers==4.52.4 +torch==2.7.1 \ No newline at end of file diff --git a/rotation.ipynb b/rotation.ipynb index 69daebe..da80835 100644 --- a/rotation.ipynb +++ b/rotation.ipynb @@ -15,6 +15,14 @@ "# %pip install auto-gptq #==0.4.0" ] }, + { + "cell_type": "markdown", + "id": "9c5573f3", + "metadata": {}, + "source": [ + "What happens if you rotate the input and output vectors?" 
+ ] + }, { "cell_type": "code", "execution_count": 1, @@ -38,33 +46,6 @@ "# from auto_gptq import AutoGPTQForCausalLM" ] }, - { - "cell_type": "code", - "execution_count": 7, - "id": "0273f299", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No GPU detected on this system.\n" - ] - } - ], - "source": [ - "gpus = GPUtil.getGPUs()\n", - "if not gpus:\n", - " print(\"No GPU detected on this system.\")\n", - "else:\n", - " for gpu in gpus:\n", - " print(f\"GPU Name: {gpu.name}\")\n", - " print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n", - " print(f\"Free VRAM: {gpu.memoryFree} MB\")\n", - " print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n", - " print(\"-\" * 40)" - ] - }, { "cell_type": "code", "execution_count": 2, @@ -117,50 +98,6 @@ "modelA.config.hidden_size == modelB.config.hidden_size " ] }, - { - "cell_type": "code", - "execution_count": 60, - "id": "dcfc2d85", - "metadata": {}, - "outputs": [], - "source": [ - "# replace tokenizer:\n", - "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", - "\n", - "# replace token embeddings for input and output:\n", - "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", - "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", - "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", - "\n", - "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1011d3ad", - "metadata": {}, - "outputs": [], - "source": [ - "# emb1 = modelA.get_input_embeddings().weight\n", - "# emb2 = modelB.get_input_embeddings().weight\n", - "\n", - "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", - "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", - "\n", - "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", - "\n", - "# print(scaling_factor)\n", - "\n", - "# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n", - "\n", - "# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", - "\n", - "# modelA.set_input_embeddings(new_embedding)\n", - "# modelA.lm_head.weight = new_embedding.weight\n" - ] - }, { "cell_type": "code", "execution_count": 113, @@ -232,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "ff93495e", "metadata": {}, "outputs": [ @@ -256,32 +193,16 @@ "\n", "print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", "print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", - "print(\"Rotated modelB mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n", + "print(\"Rotated modelA mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n", "\n", - "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2_R, dim=1).mean().item()\n", - "\n", - "# print(scaling_factor)\n", - "\n", - "# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n", "new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n", "\n", - "\n", "print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", "\n", "modelA.set_input_embeddings(new_embedding)\n", "modelA.lm_head.weight = new_embedding.weight\n" ] }, - { - "cell_type": "code", - "execution_count": 107, - "id": "9b671b41", - "metadata": {}, - "outputs": [], - "source": [ - "modelA.transformer.wpe.weight = 
modelB.transformer.wpe.weight" - ] - }, { "cell_type": "code", "execution_count": 109, @@ -303,40 +224,6 @@ "modelA.lm_head.out_features == tokenizerA.vocab_size" ] }, - { - "cell_type": "markdown", - "id": "f6b39638", - "metadata": {}, - "source": [ - "Text:" - ] - }, - { - "cell_type": "markdown", - "id": "fbfa8d62", - "metadata": {}, - "source": [ - "With it:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "998a0ed6", - "metadata": {}, - "outputs": [], - "source": [ - "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" - ] - }, - { - "cell_type": "markdown", - "id": "aa8b7ca4", - "metadata": {}, - "source": [ - "Text:" - ] - }, { "cell_type": "code", "execution_count": 10, @@ -372,120 +259,6 @@ "metadata": {}, "outputs": [], "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7673a5e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "79616f5c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "7fc76499", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "7d51d201", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", - " 11341, 389, 262, 976])\n" - ] - } - ], - "source": [ - "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", - "print(tok.input_ids[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2e76534a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", - " 11341, 389, 262, 976])\n" - ] - } - ], - "source": [ - "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", - "print(tok.input_ids[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a44c465a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "381c712f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "153995fe", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/rotation_fixed.ipynb b/rotation_fixed.ipynb index 36829f5..fc47ce1 100644 --- a/rotation_fixed.ipynb +++ b/rotation_fixed.ipynb @@ -15,6 +15,20 @@ "# %pip install auto-gptq #==0.4.0" ] }, + { + "cell_type": "markdown", + "id": "10e0a35a", + "metadata": {}, + "source": [ + "What happens if you try to rotate an entire LMM model. Will it still work if you consistently rotate all trained matrices?\n", + "\n", + "Doing this is very specific to the internal representations of a particular LMM. 
 + "\n", + "This notebook just offers some base code; it is still far removed from the right approach." + ] + }, { "cell_type": "code", "execution_count": 1, @@ -117,50 +131,6 @@ "modelA.config.hidden_size == modelB.config.hidden_size " ] }, - { - "cell_type": "code", - "execution_count": 60, - "id": "dcfc2d85", - "metadata": {}, - "outputs": [], - "source": [ - "# replace tokenizer:\n", - "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", - "\n", - "# replace token embeddings for input and output:\n", - "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", - "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", - "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", - "\n", - "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1011d3ad", - "metadata": {}, - "outputs": [], - "source": [ - "# emb1 = modelA.get_input_embeddings().weight\n", - "# emb2 = modelB.get_input_embeddings().weight\n", - "\n", - "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", - "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", - "\n", - "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", - "\n", - "# print(scaling_factor)\n", - "\n", - "# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n", - "\n", - "# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", - "\n", - "# modelA.set_input_embeddings(new_embedding)\n", - "# modelA.lm_head.weight = new_embedding.weight\n" - ] - }, { "cell_type": "code", "execution_count": 113, @@ -199,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "e1a54c24", "metadata": {}, "outputs": [], @@ -219,10 +189,7 @@ "\n", " check_orthogonal(R)\n", " \n", - " return R # return rotated tensor\n", - "\n", - "\n", - "\n" + " return R # return the rotation matrix R\n" ] }, { @@ -320,71 +287,6 @@ " \n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b671b41", - "metadata": {}, - "outputs": [], - "source": [ - "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85957357", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# modelA.lm_head.out_features == tokenizerA.vocab_size" - ] - }, - { - "cell_type": "markdown", - "id": "f6b39638", - "metadata": {}, - "source": [ - "Text:" - ] - }, - { - "cell_type": "markdown", - "id": "fbfa8d62", - "metadata": {}, - "source": [ - "With it:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "998a0ed6", - "metadata": {}, - "outputs": [], - "source": [ - "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" - ] - }, - { - "cell_type": "markdown", - "id": "aa8b7ca4", - "metadata": {}, - "source": [ - "Text:" - ] - }, { "cell_type": "code", "execution_count": 66, @@ -420,120 +322,6 @@ "metadata": {}, "outputs": [], "source": [] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "d7673a5e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "79616f5c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "7fc76499", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "7d51d201", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", - " 11341, 389, 262, 976])\n" - ] - } - ], - "source": [ - "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", - "print(tok.input_ids[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2e76534a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", - " 11341, 389, 262, 976])\n" - ] - } - ], - "source": [ - "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", - "print(tok.input_ids[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a44c465a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "381c712f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "153995fe", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/scaling.ipynb b/scaling.ipynb index d9de1d1..dbceebf 100644 --- a/scaling.ipynb +++ b/scaling.ipynb @@ -15,6 +15,14 @@ "# %pip install auto-gptq #==0.4.0" ] }, + { + "cell_type": "markdown", + "id": "8922a3da", + "metadata": {}, + "source": [ + "What happens when you rescale the input and output embeddings?" 
+ ] + }, { "cell_type": "code", "execution_count": 1, @@ -117,50 +125,6 @@ "modelA.config.hidden_size == modelB.config.hidden_size " ] }, - { - "cell_type": "code", - "execution_count": 60, - "id": "dcfc2d85", - "metadata": {}, - "outputs": [], - "source": [ - "# replace tokenizer:\n", - "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", - "\n", - "# replace token embeddings for input and output:\n", - "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", - "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", - "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", - "\n", - "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1011d3ad", - "metadata": {}, - "outputs": [], - "source": [ - "# emb1 = modelA.get_input_embeddings().weight\n", - "# emb2 = modelB.get_input_embeddings().weight\n", - "\n", - "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", - "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", - "\n", - "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", - "\n", - "# print(scaling_factor)\n", - "\n", - "# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n", - "\n", - "# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", - "\n", - "# modelA.set_input_embeddings(new_embedding)\n", - "# modelA.lm_head.weight = new_embedding.weight\n" - ] - }, { "cell_type": "code", "execution_count": 113, @@ -199,40 +163,7 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "e1a54c24", - "metadata": {}, - "outputs": [], - "source": [ - "# use proscrustes:\n", - "def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n", - " # A_centered = A - A.mean(dim=0, keepdim=True)\n", - " # B_centered = B - B.mean(dim=0, keepdim=True)\n", - "\n", - " #M = B_centered.T @ A_centered\n", - " M = B.T @ A\n", - " # find optimal rotation with svd\n", - " U, _, Vt = torch.linalg.svd(M)\n", - "\n", - " # get rotation matrix that aligns B to A\n", - " R = U @ Vt\n", - "\n", - " check_orthogonal(R)\n", - " \n", - " return B @ R # return rotated tensor\n", - "\n", - "def get_rotated_matrix(A, B, n = 1000):\n", - " # use only the first n tokens for rotation:\n", - " # return procrustes(A[:n], B[:n])\n", - " return procrustes(A, B)\n", - " \n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "ff93495e", "metadata": {}, "outputs": [ @@ -251,36 +182,21 @@ "emb1 = modelA.get_input_embeddings().weight\n", "emb2 = modelB.get_input_embeddings().weight\n", "\n", - "# emb1_R = get_rotated_matrix(emb2, emb1)\n", - "\n", "print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", "print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", - "# print(\"Rotated modelB mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n", "\n", "scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", "\n", - "print(scaling_factor)\n", + "print(\"Scaling factor: \", scaling_factor)\n", "\n", - "# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n", "new_embedding = torch.nn.Embedding.from_pretrained(emb1/scaling_factor)\n", "\n", - "\n", "print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", "\n", 
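 + "\n", + "# install the rescaled embedding and re-tie the output head to the same tensor\n",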
"modelA.set_input_embeddings(new_embedding)\n", "modelA.lm_head.weight = new_embedding.weight\n" ] }, - { - "cell_type": "code", - "execution_count": 107, - "id": "9b671b41", - "metadata": {}, - "outputs": [], - "source": [ - "modelA.transformer.wpe.weight = modelB.transformer.wpe.weight" - ] - }, { "cell_type": "code", "execution_count": 109, @@ -302,40 +218,6 @@ "modelA.lm_head.out_features == tokenizerA.vocab_size" ] }, - { - "cell_type": "markdown", - "id": "f6b39638", - "metadata": {}, - "source": [ - "Text:" - ] - }, - { - "cell_type": "markdown", - "id": "fbfa8d62", - "metadata": {}, - "source": [ - "With it:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "998a0ed6", - "metadata": {}, - "outputs": [], - "source": [ - "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" - ] - }, - { - "cell_type": "markdown", - "id": "aa8b7ca4", - "metadata": {}, - "source": [ - "Text:" - ] - }, { "cell_type": "code", "execution_count": 12, @@ -371,120 +253,6 @@ "metadata": {}, "outputs": [], "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7673a5e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "79616f5c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "7fc76499", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "7d51d201", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", - " 11341, 389, 262, 976])\n" - ] - } - ], - "source": [ - "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", - "print(tok.input_ids[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2e76534a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", - " 11341, 389, 262, 976])\n" - ] - } - ], - "source": [ - "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", - "print(tok.input_ids[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a44c465a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "381c712f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "153995fe", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {