From 6a0e3df7f26a54bcf7464f23ef7f90cc245c9154 Mon Sep 17 00:00:00 2001 From: Mick Date: Thu, 19 Jun 2025 16:26:02 +0200 Subject: [PATCH] Made a mess. lots of experiments with tweaking embeddings --- embedding.ipynb | 469 +++++++++++++++++++++++++++++++++++- reflection.ipynb | 524 ++++++++++++++++++++++++++++++++++++++++ rotation.ipynb | 512 +++++++++++++++++++++++++++++++++++++++ rotation_fixed.ipynb | 560 +++++++++++++++++++++++++++++++++++++++++++ scaling.ipynb | 511 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 2571 insertions(+), 5 deletions(-) create mode 100644 reflection.ipynb create mode 100644 rotation.ipynb create mode 100644 rotation_fixed.ipynb create mode 100644 scaling.ipynb diff --git a/embedding.ipynb b/embedding.ipynb index 30cb532..bbc01ba 100644 --- a/embedding.ipynb +++ b/embedding.ipynb @@ -2,23 +2,482 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "e28fb85c", "metadata": {}, + "outputs": [], + "source": [ + "# %pip install gputil\n", + "# %pip install setuptools\n", + "# %pip install transformers\n", + "# %pip install torch\n", + "\n", + "# %pip install auto-gptq #==0.4.0" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0667e71a", + "metadata": {}, + "outputs": [], + "source": [ + "import GPUtil\n", + "\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n", + "import torch\n", + "# from auto_gptq import AutoGPTQForCausalLM" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0273f299", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: gputil in ./.venv/lib/python3.12/site-packages (1.4.0)\n", - "Note: you may need to restart the kernel to use updated packages.\n" + "No GPU detected on this system.\n" ] } ], "source": [ - "%pip install gputil\n", - "# import gputil" + "gpus = GPUtil.getGPUs()\n", + "if not gpus:\n", + " print(\"No GPU detected on this system.\")\n", + "else:\n", + " for gpu in gpus:\n", + " print(f\"GPU Name: {gpu.name}\")\n", + " print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n", + " print(f\"Free VRAM: {gpu.memoryFree} MB\")\n", + " print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n", + " print(\"-\" * 40)" ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "67d7e006", + "metadata": {}, + "outputs": [], + "source": [ + "def grab_model(model_name, quantized = False):\n", + " if quantized:\n", + " model = AutoGPTQForCausalLM.from_quantized(model_name, device=\"cpu\", use_safetensors=True)\n", + " else:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " return model, tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "153e9ff5", + "metadata": {}, + "outputs": [], + "source": [ + "modelA, tokenizerA = grab_model(\"gpt2\")\n", + "modelB, tokenizerB = grab_model(\"EleutherAI/gpt-neo-125M\")\n", + "\n", + "# modelA, tokenizerA = grab_model(\"EleutherAI/gpt-neo-125M-4bit\", quantized=True)\n", + "# modelB, tokenizerB = grab_model(\"iproskurina/opt-125m-GPTQ-4bit-g128\", quantized=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1da291ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.config.hidden_size == modelB.config.hidden_size " + ] + }, + { + "cell_type": "code", + 
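The commented-out cell that follows is the naive version of the experiment: transplant GPT-Neo's token embeddings into GPT-2 wholesale. A minimal sketch of that swap, using only the model names and calls already shown above (the head re-tie is needed because GPT-2 ties `lm_head.weight` to the input embedding):

```python
from transformers import AutoModelForCausalLM

# Host (GPT-2) and donor (GPT-Neo-125M); both use hidden_size = 768.
modelA = AutoModelForCausalLM.from_pretrained("gpt2")
modelB = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
assert modelA.config.hidden_size == modelB.config.hidden_size

# Transplant B's token embeddings into A, then re-tie the output head so
# lm_head and the new input embedding share one weight again.
modelA.set_input_embeddings(modelB.get_input_embeddings())
modelA.lm_head.weight = modelA.get_input_embeddings().weight
```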
"execution_count": 60, + "id": "dcfc2d85", + "metadata": {}, + "outputs": [], + "source": [ + "# replace tokenizer:\n", + "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", + "\n", + "# replace token embeddings for input and output:\n", + "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", + "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", + "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", + "\n", + "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1011d3ad", + "metadata": {}, + "outputs": [], + "source": [ + "# emb1 = modelA.get_input_embeddings().weight\n", + "# emb2 = modelB.get_input_embeddings().weight\n", + "\n", + "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", + "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", + "\n", + "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", + "\n", + "# print(scaling_factor)\n", + "\n", + "# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n", + "\n", + "# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", + "\n", + "# modelA.set_input_embeddings(new_embedding)\n", + "# modelA.lm_head.weight = new_embedding.weight\n" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "c62b2f41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(False)\n", + "tensor(22.2842, grad_fn=)\n", + "tensor(11.5013, grad_fn=)\n" + ] + } + ], + "source": [ + "print(torch.isnan(modelB.get_input_embeddings().weight).any())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "2b9893a3", + "metadata": {}, + "outputs": [], + "source": [ + "def check_orthogonal(R):\n", + " I = torch.eye(R.size(0), device=R.device)\n", + " delta = torch.norm(R.T @ R - I)\n", + " print(f\"Delta: {delta:.6e}\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "e1a54c24", + "metadata": {}, + "outputs": [], + "source": [ + "# use proscrustes:\n", + "def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n", + " # A_centered = A - A.mean(dim=0, keepdim=True)\n", + " # B_centered = B - B.mean(dim=0, keepdim=True)\n", + "\n", + " #M = B_centered.T @ A_centered\n", + " M = B.T @ A\n", + " # find optimal rotation with svd\n", + " U, _, Vt = torch.linalg.svd(M)\n", + "\n", + " # get rotation matrix that aligns B to A\n", + " R = U @ Vt\n", + "\n", + " check_orthogonal(R)\n", + " \n", + " return B @ R # return rotated tensor\n", + "\n", + "def get_rotated_matrix(A, B, n = 1000):\n", + " # use only the first n tokens for rotation:\n", + " # return procrustes(A[:n], B[:n])\n", + " return procrustes(A, B)\n", + " \n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff93495e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Delta: 6.659338e-05\n", + "ModelA mean norms: 3.9585366249084473\n", + "ModelB mean norms: 11.50130844116211\n", + "Rotated modelB mean norms: 11.501314163208008\n", + "0.3441812447460622\n", + "new_embedding mean norms: 11.501314163208008\n" + ] + } + ], + "source": [ + "emb1 = 
modelA.get_input_embeddings().weight\n", + "emb2 = modelB.get_input_embeddings().weight\n", + "\n", + "emb2_R = get_rotated_matrix(emb1, emb2)\n", + "\n", + "print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", + "print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", + "print(\"Rotated modelB mean norms:\", torch.norm(emb2_R, dim=1).mean().item())\n", + "\n", + "scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2_R, dim=1).mean().item()\n", + "\n", + "print(scaling_factor)\n", + "\n", + "new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n", + "# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R)\n", + "\n", + "\n", + "print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", + "\n", + "modelA.set_input_embeddings(new_embedding)\n", + "modelA.lm_head.weight = new_embedding.weight\n" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "9b671b41", + "metadata": {}, + "outputs": [], + "source": [ + "modelA.transformer.wpe.weight = modelB.transformer.wpe.weight" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "85957357", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.lm_head.out_features == tokenizerA.vocab_size" + ] + }, + { + "cell_type": "markdown", + "id": "f6b39638", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "markdown", + "id": "fbfa8d62", + "metadata": {}, + "source": [ + "With it:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "998a0ed6", + "metadata": {}, + "outputs": [], + "source": [ + "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" + ] + }, + { + "cell_type": "markdown", + "id": "aa8b7ca4", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "d8d9d612", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Device set to use cpu\n", + "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'generated_text': 'Hello, how are you? 
the he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he'}]\n" + ] + } + ], + "source": [ + "# use model\n", + "pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n", + "print(pipe(\"Hello, how are you?\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc72ea8a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7673a5e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "79616f5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7fc76499", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7d51d201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e76534a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a44c465a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "381c712f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "153995fe", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/reflection.ipynb b/reflection.ipynb new file mode 100644 index 0000000..18328b2 --- /dev/null +++ b/reflection.ipynb @@ -0,0 +1,524 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e28fb85c", + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install gputil\n", + "# %pip install 
setuptools\n", + "# %pip install transformers\n", + "# %pip install torch\n", + "\n", + "# %pip install auto-gptq #==0.4.0" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0667e71a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mick/pycharmprojects/Frankenstein/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import GPUtil\n", + "\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n", + "import torch\n", + "# from auto_gptq import AutoGPTQForCausalLM" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0273f299", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No GPU detected on this system.\n" + ] + } + ], + "source": [ + "gpus = GPUtil.getGPUs()\n", + "if not gpus:\n", + " print(\"No GPU detected on this system.\")\n", + "else:\n", + " for gpu in gpus:\n", + " print(f\"GPU Name: {gpu.name}\")\n", + " print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n", + " print(f\"Free VRAM: {gpu.memoryFree} MB\")\n", + " print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n", + " print(\"-\" * 40)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "67d7e006", + "metadata": {}, + "outputs": [], + "source": [ + "def grab_model(model_name, quantized = False):\n", + " if quantized:\n", + " model = AutoGPTQForCausalLM.from_quantized(model_name, device=\"cpu\", use_safetensors=True)\n", + " else:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " return model, tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "153e9ff5", + "metadata": {}, + "outputs": [], + "source": [ + "modelA, tokenizerA = grab_model(\"gpt2\")\n", + "modelB, tokenizerB = grab_model(\"EleutherAI/gpt-neo-125M\")\n", + "\n", + "# modelA, tokenizerA = grab_model(\"EleutherAI/gpt-neo-125M-4bit\", quantized=True)\n", + "# modelB, tokenizerB = grab_model(\"iproskurina/opt-125m-GPTQ-4bit-g128\", quantized=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1da291ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.config.hidden_size == modelB.config.hidden_size " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "dcfc2d85", + "metadata": {}, + "outputs": [], + "source": [ + "# replace tokenizer:\n", + "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", + "\n", + "# replace token embeddings for input and output:\n", + "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", + "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", + "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", + "\n", + "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1011d3ad", + "metadata": {}, + "outputs": [], + "source": [ + "# emb1 = modelA.get_input_embeddings().weight\n", + "# emb2 = modelB.get_input_embeddings().weight\n", + "\n", + "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", 
+ "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", + "\n", + "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", + "\n", + "# print(scaling_factor)\n", + "\n", + "# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n", + "\n", + "# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", + "\n", + "# modelA.set_input_embeddings(new_embedding)\n", + "# modelA.lm_head.weight = new_embedding.weight\n" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "c62b2f41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(False)\n", + "tensor(22.2842, grad_fn=)\n", + "tensor(11.5013, grad_fn=)\n" + ] + } + ], + "source": [ + "print(torch.isnan(modelB.get_input_embeddings().weight).any())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2b9893a3", + "metadata": {}, + "outputs": [], + "source": [ + "def check_orthogonal(R):\n", + " I = torch.eye(R.size(0), device=R.device)\n", + " delta = torch.norm(R.T @ R - I)\n", + " print(f\"Delta: {delta:.6e}\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e1a54c24", + "metadata": {}, + "outputs": [], + "source": [ + "# use proscrustes:\n", + "def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n", + " # A_centered = A - A.mean(dim=0, keepdim=True)\n", + " # B_centered = B - B.mean(dim=0, keepdim=True)\n", + "\n", + " #M = B_centered.T @ A_centered\n", + " M = B.T @ A\n", + " # find optimal rotation with svd\n", + " U, _, Vt = torch.linalg.svd(M)\n", + "\n", + " # get rotation matrix that aligns B to A\n", + " R = U @ Vt\n", + "\n", + " check_orthogonal(R)\n", + " \n", + " return B @ R # return rotated tensor\n", + "\n", + "def get_rotated_matrix(A, B, n = 1000):\n", + " # use only the first n tokens for rotation:\n", + " # return procrustes(A[:n], B[:n])\n", + " return procrustes(A, B)\n", + " \n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "0160d672", + "metadata": {}, + "outputs": [], + "source": [ + "emb1 = modelA.get_input_embeddings().weight\n", + "emb2 = modelB.get_input_embeddings().weight" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "ff93495e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ModelA mean norms: 3.9585366249084473\n", + "Rotated modelB mean norms: 3.958536148071289\n", + "new_embedding mean norms: 3.958536148071289\n" + ] + } + ], + "source": [ + "\n", + "\n", + "# emb1_R = get_rotated_matrix(emb2, emb1)\n", + "\n", + "#emb1_R = emb1 * -1\n", + "# emb1_R = emb1.T\n", + "emb1_R = emb1.flip(dims=[0]) # .reverse() #[::-1]\n", + "\n", + "print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", + "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", + "print(\"Rotated modelB mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n", + "\n", + "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2_R, dim=1).mean().item()\n", + "\n", + "# print(scaling_factor)\n", + "\n", + "# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n", + "new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n", + "\n", + "\n", + 
"print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", + "\n", + "modelA.set_input_embeddings(new_embedding)\n", + "modelA.lm_head.weight = new_embedding.weight\n" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "9b671b41", + "metadata": {}, + "outputs": [], + "source": [ + "modelA.transformer.wpe.weight = modelB.transformer.wpe.weight" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "85957357", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.lm_head.out_features == tokenizerA.vocab_size" + ] + }, + { + "cell_type": "markdown", + "id": "f6b39638", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "markdown", + "id": "fbfa8d62", + "metadata": {}, + "source": [ + "With it:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "998a0ed6", + "metadata": {}, + "outputs": [], + "source": [ + "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" + ] + }, + { + "cell_type": "markdown", + "id": "aa8b7ca4", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d8d9d612", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Device set to use cpu\n", + "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'generated_text': 'Hello, how are you? Hitman chemicallychev lobeassi composure029capitalist composure exacerbateightingataka harsher Hoy 1886 typew composure curlsidad harsher Babe lobeMach Titus kindred chemicallyRush Intelligent Scare annihilationoblchev harsher Christy Christyansky peppighting typew composure OPEC HitmanEngineDebugchev lobe conceptions partying IGF partying composure 1886 harsherRush castlesGbLESS composure peppightinglations Optical ENTER Tel harsherRush siph composure 1886 chemicallyRushAbysstechnology Rated instructional Scare annihilationchev harsher Christy Christy Leilan repaidevaluate clamp composure peppighting partyingkie partyingRush522 HitmanEngineDebugRushspective629chevAbyss Rated Ada doesnt harsherRush MILL THESE CSI AchievementsCollinschev lobe Ada doesnt harsher Christy Christy feats! 
kW unjust Ker workaround Hitman Mondays bunnyManufact Mercenary composuregradient OnePlustown grandmaansky upbringingsei781Alternative 465 Kafka partyingMach Trash totem typew grandma733 composure184capitalist Naplesreditary MISS gazedcele composure528 lobe supremacists Hitman DISTRICT Dominic lair harsher Christy Christy Livebushimaruansky upbringingsei amplification SERVikanovych Christy Christyansky Kafkacanon Wanted spears tamp chemically Ker workaround typew 451 annihilation 1889Mach Trash Mondaysprinted]+ RatedPlotcele NETWORK Trash lobe Curve Mercenary composure vitri833 HitmanManufact harsher Christy Christy gazed jihadists typewMach trout baths Trash781 Ker workaroundolded composureWord Lyndon harsher grandma896 lobe decaying annihilation RatedMach FRI PERSON MondaysAlternativeMach FRI redundancy harsher BCC lobe decaying annihilation CONTROLMach ragedcapitalist CRC Helsinki harsher BCC lobe decaying annihilation'}]\n" + ] + } + ], + "source": [ + "# use model\n", + "pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n", + "print(pipe(\"Hello, how are you?\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc72ea8a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7673a5e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "79616f5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7fc76499", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7d51d201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e76534a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a44c465a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "381c712f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "153995fe", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
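The `data_ptr()` cells above are the mechanism behind every head re-tie in these notebooks: GPT-2 and GPT-Neo tie `lm_head.weight` to the input embedding, so after any swap both names must end up pointing at the same storage again. A minimal illustration of such a tie:

```python
import torch

emb = torch.nn.Embedding(10, 4)
head = torch.nn.Linear(4, 10, bias=False)
head.weight = emb.weight                          # tie: one shared Parameter
assert head.weight.data_ptr() == emb.weight.data_ptr()

with torch.no_grad():
    emb.weight.mul_(2.0)                          # edits show through both
assert torch.equal(head.weight, emb.weight)
```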
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/rotation.ipynb b/rotation.ipynb new file mode 100644 index 0000000..69daebe --- /dev/null +++ b/rotation.ipynb @@ -0,0 +1,512 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e28fb85c", + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install gputil\n", + "# %pip install setuptools\n", + "# %pip install transformers\n", + "# %pip install torch\n", + "\n", + "# %pip install auto-gptq #==0.4.0" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0667e71a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mick/pycharmprojects/Frankenstein/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import GPUtil\n", + "\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n", + "import torch\n", + "# from auto_gptq import AutoGPTQForCausalLM" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0273f299", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No GPU detected on this system.\n" + ] + } + ], + "source": [ + "gpus = GPUtil.getGPUs()\n", + "if not gpus:\n", + " print(\"No GPU detected on this system.\")\n", + "else:\n", + " for gpu in gpus:\n", + " print(f\"GPU Name: {gpu.name}\")\n", + " print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n", + " print(f\"Free VRAM: {gpu.memoryFree} MB\")\n", + " print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n", + " print(\"-\" * 40)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "67d7e006", + "metadata": {}, + "outputs": [], + "source": [ + "def grab_model(model_name, quantized = False):\n", + " if quantized:\n", + " model = AutoGPTQForCausalLM.from_quantized(model_name, device=\"cpu\", use_safetensors=True)\n", + " else:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " return model, tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "153e9ff5", + "metadata": {}, + "outputs": [], + "source": [ + "modelA, tokenizerA = grab_model(\"gpt2\")\n", + "modelB, tokenizerB = grab_model(\"EleutherAI/gpt-neo-125M\")\n", + "\n", + "# modelA, tokenizerA = grab_model(\"EleutherAI/gpt-neo-125M-4bit\", quantized=True)\n", + "# modelB, tokenizerB = grab_model(\"iproskurina/opt-125m-GPTQ-4bit-g128\", quantized=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1da291ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.config.hidden_size == modelB.config.hidden_size " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "dcfc2d85", + "metadata": {}, + "outputs": [], + "source": [ + "# replace tokenizer:\n", + "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", + "\n", + "# replace token embeddings for input and output:\n", + "# 
modelA.set_input_embeddings(modelB.get_input_embeddings())\n", + "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", + "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", + "\n", + "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1011d3ad", + "metadata": {}, + "outputs": [], + "source": [ + "# emb1 = modelA.get_input_embeddings().weight\n", + "# emb2 = modelB.get_input_embeddings().weight\n", + "\n", + "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", + "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", + "\n", + "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", + "\n", + "# print(scaling_factor)\n", + "\n", + "# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n", + "\n", + "# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", + "\n", + "# modelA.set_input_embeddings(new_embedding)\n", + "# modelA.lm_head.weight = new_embedding.weight\n" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "c62b2f41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(False)\n", + "tensor(22.2842, grad_fn=)\n", + "tensor(11.5013, grad_fn=)\n" + ] + } + ], + "source": [ + "print(torch.isnan(modelB.get_input_embeddings().weight).any())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2b9893a3", + "metadata": {}, + "outputs": [], + "source": [ + "def check_orthogonal(R):\n", + " I = torch.eye(R.size(0), device=R.device)\n", + " delta = torch.norm(R.T @ R - I)\n", + " print(f\"Delta: {delta:.6e}\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e1a54c24", + "metadata": {}, + "outputs": [], + "source": [ + "# use proscrustes:\n", + "def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n", + " # A_centered = A - A.mean(dim=0, keepdim=True)\n", + " # B_centered = B - B.mean(dim=0, keepdim=True)\n", + "\n", + " #M = B_centered.T @ A_centered\n", + " M = B.T @ A\n", + " # find optimal rotation with svd\n", + " U, _, Vt = torch.linalg.svd(M)\n", + "\n", + " # get rotation matrix that aligns B to A\n", + " R = U @ Vt\n", + "\n", + " check_orthogonal(R)\n", + " \n", + " return B @ R # return rotated tensor\n", + "\n", + "def get_rotated_matrix(A, B, n = 1000):\n", + " # use only the first n tokens for rotation:\n", + " # return procrustes(A[:n], B[:n])\n", + " return procrustes(A, B)\n", + " \n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ff93495e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Delta: 6.907746e-05\n", + "ModelA mean norms: 3.9585366249084473\n", + "ModelB mean norms: 11.50130844116211\n", + "Rotated modelB mean norms: 3.958536148071289\n", + "new_embedding mean norms: 3.958536148071289\n" + ] + } + ], + "source": [ + "emb1 = modelA.get_input_embeddings().weight\n", + "emb2 = modelB.get_input_embeddings().weight\n", + "\n", + "emb1_R = get_rotated_matrix(emb2, emb1)\n", + "\n", + "print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", + "print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", + 
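The `procrustes()` helper above is the closed-form orthogonal Procrustes solution: with `M = B.T @ A` and SVD `M = U S Vt`, `R = U @ Vt` minimizes ||B @ R - A|| in the Frobenius norm over all orthogonal R. Orthogonality is what keeps the rotated rows' norms unchanged and makes the printed Delta tiny (~1e-4 here). A self-contained check on random data:

```python
import torch

def procrustes_rotation(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """Orthogonal R minimizing ||B @ R - A||_F (closed form via SVD)."""
    U, _, Vt = torch.linalg.svd(B.T @ A)
    return U @ Vt

A, B = torch.randn(2000, 64), torch.randn(2000, 64)
R = procrustes_rotation(A, B)
print(torch.norm(R.T @ R - torch.eye(64)))        # ~1e-6: R is orthogonal
print(torch.allclose((B @ R).norm(dim=1), B.norm(dim=1), atol=1e-4))
```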
"print(\"Rotated modelB mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n", + "\n", + "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2_R, dim=1).mean().item()\n", + "\n", + "# print(scaling_factor)\n", + "\n", + "# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n", + "new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n", + "\n", + "\n", + "print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", + "\n", + "modelA.set_input_embeddings(new_embedding)\n", + "modelA.lm_head.weight = new_embedding.weight\n" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "9b671b41", + "metadata": {}, + "outputs": [], + "source": [ + "modelA.transformer.wpe.weight = modelB.transformer.wpe.weight" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "85957357", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.lm_head.out_features == tokenizerA.vocab_size" + ] + }, + { + "cell_type": "markdown", + "id": "f6b39638", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "markdown", + "id": "fbfa8d62", + "metadata": {}, + "source": [ + "With it:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "998a0ed6", + "metadata": {}, + "outputs": [], + "source": [ + "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" + ] + }, + { + "cell_type": "markdown", + "id": "aa8b7ca4", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d8d9d612", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Device set to use cpu\n", + "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'generated_text': 'Hello, how are you?erderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderd cue cue cue cuecue cuecueerd Nicotineerd Nicotineerd Nicotineerd Nicotineerd Nicotine cue Nicotine cue Nicotine cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue'}]\n" + ] + } + ], + "source": [ + "# use model\n", + "pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n", + "print(pipe(\"Hello, how are you?\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc72ea8a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7673a5e", + "metadata": {}, + "outputs": [], + "source": [] + }, + 
{ + "cell_type": "code", + "execution_count": 26, + "id": "79616f5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7fc76499", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7d51d201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e76534a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a44c465a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "381c712f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "153995fe", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/rotation_fixed.ipynb b/rotation_fixed.ipynb new file mode 100644 index 0000000..36829f5 --- /dev/null +++ b/rotation_fixed.ipynb @@ -0,0 +1,560 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e28fb85c", + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install gputil\n", + "# %pip install setuptools\n", + "# %pip install transformers\n", + "# %pip install torch\n", + "\n", + "# %pip install auto-gptq #==0.4.0" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0667e71a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mick/pycharmprojects/Frankenstein/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import GPUtil\n", + "\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n", + "import torch\n", + "# from auto_gptq import AutoGPTQForCausalLM" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0273f299", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No GPU detected on this system.\n" + ] + } + ], + "source": [ + "gpus = GPUtil.getGPUs()\n", + "if not gpus:\n", + " print(\"No GPU detected on this system.\")\n", + "else:\n", + " for gpu in gpus:\n", + " print(f\"GPU Name: {gpu.name}\")\n", + " print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n", + " print(f\"Free VRAM: {gpu.memoryFree} MB\")\n", + " print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n", + " print(\"-\" * 40)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "67d7e006", + "metadata": {}, + "outputs": [], + "source": [ + "def grab_model(model_name, quantized = False):\n", + " if quantized:\n", + " model = AutoGPTQForCausalLM.from_quantized(model_name, device=\"cpu\", use_safetensors=True)\n", + " else:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " return model, tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "153e9ff5", + "metadata": {}, + "outputs": [], + "source": [ + "modelA, tokenizerA = grab_model(\"gpt2\")\n", + "modelB, tokenizerB = grab_model(\"EleutherAI/gpt-neo-125M\")\n", + "\n", + "# modelA, tokenizerA = grab_model(\"EleutherAI/gpt-neo-125M-4bit\", quantized=True)\n", + "# modelB, tokenizerB = grab_model(\"iproskurina/opt-125m-GPTQ-4bit-g128\", quantized=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1da291ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.config.hidden_size == modelB.config.hidden_size " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "dcfc2d85", + "metadata": {}, + "outputs": [], + "source": [ + "# replace tokenizer:\n", + "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", + "\n", + "# replace token embeddings for input and output:\n", + "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", + "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", + "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", + "\n", + "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1011d3ad", + "metadata": {}, + "outputs": [], + "source": [ + "# emb1 = modelA.get_input_embeddings().weight\n", + "# emb2 = modelB.get_input_embeddings().weight\n", + "\n", + "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", + "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", + "\n", + "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", + "\n", + "# print(scaling_factor)\n", + "\n", + "# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n", + "\n", + "# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", + "\n", + "# 
modelA.set_input_embeddings(new_embedding)\n", + "# modelA.lm_head.weight = new_embedding.weight\n" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "c62b2f41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(False)\n", + "tensor(22.2842, grad_fn=)\n", + "tensor(11.5013, grad_fn=)\n" + ] + } + ], + "source": [ + "print(torch.isnan(modelB.get_input_embeddings().weight).any())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2b9893a3", + "metadata": {}, + "outputs": [], + "source": [ + "def check_orthogonal(R):\n", + " I = torch.eye(R.size(0), device=R.device)\n", + " delta = torch.norm(R.T @ R - I)\n", + " print(f\"Delta: {delta:.6e}\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e1a54c24", + "metadata": {}, + "outputs": [], + "source": [ + "# use proscrustes:\n", + "def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n", + " # A_centered = A - A.mean(dim=0, keepdim=True)\n", + " # B_centered = B - B.mean(dim=0, keepdim=True)\n", + "\n", + " #M = B_centered.T @ A_centered\n", + " M = B.T @ A\n", + " # find optimal rotation with svd\n", + " U, _, Vt = torch.linalg.svd(M)\n", + "\n", + " # get rotation matrix that aligns B to A\n", + " R = U @ Vt\n", + "\n", + " check_orthogonal(R)\n", + " \n", + " return R # return rotated tensor\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fedd4d04", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff93495e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Delta: 6.706436e-05\n", + "torch.Size([1024, 768])\n" + ] + } + ], + "source": [ + "emb1 = modelA.get_input_embeddings().weight\n", + "emb2 = modelB.get_input_embeddings().weight\n", + "\n", + "# get rotation matrix\n", + "R = procrustes(emb2, emb1)\n", + "emb1_R = emb1 @ R\n", + "\n", + "new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n", + "\n", + "modelA.set_input_embeddings(new_embedding)\n", + "modelA.lm_head.weight = new_embedding.weight\n", + "\n", + "# def rotate_weight(W, R):\n", + "# if W.shape[1] == R.shape[0]:\n", + "# return W @ R\n", + "# if W.shape[0] == R.shape[0]:\n", + "# return R.T @ W\n", + "\n", + "# now fix the other layers by conjugating:\n", + "# for block in modelA.transformer.h:\n", + "# for M in [block.attn.c_attn, block.mlp.c_fc]:\n", + "# W = M.weight.data\n", + "# W[:] = R.T @ W\n", + "# for M in [block.attn.c_proj, block.mlp.c_proj]:\n", + "# W = M.weight.data\n", + "# W[:] = R.T @ W @ R\n", + "\n", + "def split_rotate_concat(W):\n", + " parts1 = [x for x in W.split(768, dim=1)]\n", + " for i, v in enumerate(parts1):\n", + " parts2 = [x for x in v.split(768, dim=0)]\n", + " for j, w in enumerate(parts2):\n", + " parts2[j] = R.T @ w @ R\n", + " parts1[i] = torch.cat(parts2, dim=0)\n", + " return torch.cat(parts1, dim=1)\n", + "\n", + "\n", + "def rotate_layernorm(ln):\n", + " ln.weight.data[:] = ln.weight.data @ R\n", + " ln.bias.data[:] = ln.bias.data @ R\n", + "\n", + "for block in modelA.transformer.h:\n", + " # print(block.attn.c_attn.weight.data.shape)\n", + " # print(block.mlp.c_fc.weight.data.shape)\n", + " # print(block.attn.c_proj.weight.data.shape)\n", + " # 
print(block.mlp.c_proj.weight.data.shape)\n", + " # block.attn.c_attn.weight.data[:] = split_rotate_concat(block.attn.c_attn.weight.data.T).T\n", + " # block.mlp.c_fc.weight.data[:] = split_rotate_concat(block.mlp.c_fc.weight.data.T).T\n", + " block.attn.c_attn.weight.data[:] = split_rotate_concat(block.attn.c_attn.weight.data)\n", + " block.mlp.c_fc.weight.data[:] = split_rotate_concat(block.mlp.c_fc.weight.data)\n", + " block.attn.c_proj.weight.data[:] = split_rotate_concat(block.attn.c_proj.weight.data)\n", + " block.mlp.c_proj.weight.data[:] = split_rotate_concat(block.mlp.c_proj.weight.data)\n", + " rotate_layernorm(block.ln_1)\n", + " rotate_layernorm(block.ln_2)\n", + "\n", + "rotate_layernorm(modelA.transformer.ln_f)\n", + "\n", + "\n", + "print(modelA.transformer.wpe.weight.data.shape)\n", + "modelA.transformer.wpe.weight.data[:] = modelA.transformer.wpe.weight.data @ R\n", + "\n", + " # for name in ['c_attn', 'c_proj']:\n", + " # W = getattr(block.attn, name).weight.data\n", + " # W[:] = R.T @ W @ R\n", + " # w1 = block.mlp.c_fc.weight.data\n", + " # w2 = block.mlp.c_proj.weight.data\n", + " # w1[:] = R.T @ W1 @ R\n", + " # w2[:] = R.T @ W2 @ R\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b671b41", + "metadata": {}, + "outputs": [], + "source": [ + "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85957357", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# modelA.lm_head.out_features == tokenizerA.vocab_size" + ] + }, + { + "cell_type": "markdown", + "id": "f6b39638", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "markdown", + "id": "fbfa8d62", + "metadata": {}, + "source": [ + "With it:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "998a0ed6", + "metadata": {}, + "outputs": [], + "source": [ + "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" + ] + }, + { + "cell_type": "markdown", + "id": "aa8b7ca4", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "d8d9d612", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Device set to use cpu\n", + "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'generated_text': 'Hello, how are you?orm Coulormormorm Coulorm Coul Coulorm Coul Coulinion Coulorm Coulonomousonomous Coulonomousonomousonomousonomousonomousormonomous Coulorm Coulonomousonomousonomous Coulonomousonomous Coulonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Amenonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomous Coulonomousonomousonomousonomousonomousonomousonomoushered…] Coulonomousonomousonomous 
Amenomniaifulonomousonomouskeleyifulonomous Amenomniaifulhered Amenkeleyomniastad Coulonomousifulifulomniaifulomniaifulomniaifulifulifulomniaifulomnia…]hered…]ifulomniaifulifulomniastadkeleyomniaifulifulomniaifulomniaifulomniakeleyomniaomniaomnia Coulomniaifulomnia Coulifulomnia Coul Coulkeleyomniastad Coulomnia Coulkeleyomnia Coulkeleyomnia Coulkeleyomniaomnia Coulkeleyomniaomniaomniaomniastadomniaomniaomniaomnia Coulkeleyonomousomnia Coulomniaomniaomnia Coulkeleyomnia Coulomniaomniaomniaomnia Coulomniaomniakeleyomniakeleyomniakeleyomniaomniaomniaomniakeleystadkeleyomniakeleyomniaomnia'}]\n" + ] + } + ], + "source": [ + "# use model\n", + "pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n", + "print(pipe(\"Hello, how are you?\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc72ea8a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7673a5e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "79616f5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7fc76499", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7d51d201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e76534a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a44c465a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "381c712f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "153995fe", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scaling.ipynb b/scaling.ipynb new file mode 100644 index 0000000..d9de1d1 --- /dev/null +++ 
b/scaling.ipynb @@ -0,0 +1,511 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e28fb85c", + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install gputil\n", + "# %pip install setuptools\n", + "# %pip install transformers\n", + "# %pip install torch\n", + "\n", + "# %pip install auto-gptq #==0.4.0" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0667e71a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mick/pycharmprojects/Frankenstein/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import GPUtil\n", + "\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n", + "import torch\n", + "# from auto_gptq import AutoGPTQForCausalLM" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0273f299", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No GPU detected on this system.\n" + ] + } + ], + "source": [ + "gpus = GPUtil.getGPUs()\n", + "if not gpus:\n", + " print(\"No GPU detected on this system.\")\n", + "else:\n", + " for gpu in gpus:\n", + " print(f\"GPU Name: {gpu.name}\")\n", + " print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n", + " print(f\"Free VRAM: {gpu.memoryFree} MB\")\n", + " print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n", + " print(\"-\" * 40)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "67d7e006", + "metadata": {}, + "outputs": [], + "source": [ + "def grab_model(model_name, quantized = False):\n", + " if quantized:\n", + " model = AutoGPTQForCausalLM.from_quantized(model_name, device=\"cpu\", use_safetensors=True)\n", + " else:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " return model, tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "153e9ff5", + "metadata": {}, + "outputs": [], + "source": [ + "modelA, tokenizerA = grab_model(\"gpt2\")\n", + "modelB, tokenizerB = grab_model(\"EleutherAI/gpt-neo-125M\")\n", + "\n", + "# modelA, tokenizerA = grab_model(\"EleutherAI/gpt-neo-125M-4bit\", quantized=True)\n", + "# modelB, tokenizerB = grab_model(\"iproskurina/opt-125m-GPTQ-4bit-g128\", quantized=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1da291ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.config.hidden_size == modelB.config.hidden_size " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "dcfc2d85", + "metadata": {}, + "outputs": [], + "source": [ + "# replace tokenizer:\n", + "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", + "\n", + "# replace token embeddings for input and output:\n", + "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", + "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", + "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", + "\n", + "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1011d3ad", + "metadata": {}, + 
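Each notebook also toys with handing GPT-Neo's learned positional embeddings to GPT-2 (the commented `modelA.transformer.wpe.weight = modelB.transformer.wpe.weight` above). One caveat: plain Parameter assignment aliases the two models and silently accepts a length mismatch (GPT-2 is trained for 1024 positions, GPT-Neo-125M for 2048). A hypothetical shape-checked variant, `swap_positional_embeddings`, not from the notebooks:

```python
import torch

def swap_positional_embeddings(dst, src):
    # Copy src's learned position table into dst, keeping dst's length
    # and dst's own storage (no aliasing between the two models).
    dst_wpe, src_wpe = dst.transformer.wpe, src.transformer.wpe
    n, d = dst_wpe.weight.shape
    if src_wpe.weight.shape[1] != d or src_wpe.weight.shape[0] < n:
        raise ValueError("incompatible position-embedding shapes")
    with torch.no_grad():
        dst_wpe.weight.copy_(src_wpe.weight[:n])
```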
"outputs": [], + "source": [ + "# emb1 = modelA.get_input_embeddings().weight\n", + "# emb2 = modelB.get_input_embeddings().weight\n", + "\n", + "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", + "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", + "\n", + "# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", + "\n", + "# print(scaling_factor)\n", + "\n", + "# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n", + "\n", + "# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", + "\n", + "# modelA.set_input_embeddings(new_embedding)\n", + "# modelA.lm_head.weight = new_embedding.weight\n" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "c62b2f41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(False)\n", + "tensor(22.2842, grad_fn=)\n", + "tensor(11.5013, grad_fn=)\n" + ] + } + ], + "source": [ + "print(torch.isnan(modelB.get_input_embeddings().weight).any())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n", + "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2b9893a3", + "metadata": {}, + "outputs": [], + "source": [ + "def check_orthogonal(R):\n", + " I = torch.eye(R.size(0), device=R.device)\n", + " delta = torch.norm(R.T @ R - I)\n", + " print(f\"Delta: {delta:.6e}\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e1a54c24", + "metadata": {}, + "outputs": [], + "source": [ + "# use proscrustes:\n", + "def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n", + " # A_centered = A - A.mean(dim=0, keepdim=True)\n", + " # B_centered = B - B.mean(dim=0, keepdim=True)\n", + "\n", + " #M = B_centered.T @ A_centered\n", + " M = B.T @ A\n", + " # find optimal rotation with svd\n", + " U, _, Vt = torch.linalg.svd(M)\n", + "\n", + " # get rotation matrix that aligns B to A\n", + " R = U @ Vt\n", + "\n", + " check_orthogonal(R)\n", + " \n", + " return B @ R # return rotated tensor\n", + "\n", + "def get_rotated_matrix(A, B, n = 1000):\n", + " # use only the first n tokens for rotation:\n", + " # return procrustes(A[:n], B[:n])\n", + " return procrustes(A, B)\n", + " \n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ff93495e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ModelA mean norms: 1.3624546527862549\n", + "ModelB mean norms: 11.50130844116211\n", + "0.1184608394563315\n", + "new_embedding mean norms: 11.50130844116211\n" + ] + } + ], + "source": [ + "emb1 = modelA.get_input_embeddings().weight\n", + "emb2 = modelB.get_input_embeddings().weight\n", + "\n", + "# emb1_R = get_rotated_matrix(emb2, emb1)\n", + "\n", + "print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", + "print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", + "# print(\"Rotated modelB mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n", + "\n", + "scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n", + "\n", + "print(scaling_factor)\n", + "\n", + "# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n", + "new_embedding = torch.nn.Embedding.from_pretrained(emb1/scaling_factor)\n", + "\n", + "\n", + 
"print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n", + "\n", + "modelA.set_input_embeddings(new_embedding)\n", + "modelA.lm_head.weight = new_embedding.weight\n" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "9b671b41", + "metadata": {}, + "outputs": [], + "source": [ + "modelA.transformer.wpe.weight = modelB.transformer.wpe.weight" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "85957357", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.lm_head.out_features == tokenizerA.vocab_size" + ] + }, + { + "cell_type": "markdown", + "id": "f6b39638", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "markdown", + "id": "fbfa8d62", + "metadata": {}, + "source": [ + "With it:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "998a0ed6", + "metadata": {}, + "outputs": [], + "source": [ + "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" + ] + }, + { + "cell_type": "markdown", + "id": "aa8b7ca4", + "metadata": {}, + "source": [ + "Text:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d8d9d612", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Device set to use cpu\n", + "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'generated_text': 'Hello, how are you?\\n\\nYou are not a new.\\n\\nYou are a new.\\n\\n\\nYou are not a new.\\n\\n\\nYou are not a new.\\n\\n\\na new.\\n\\na new.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na\\n\\na.\\n\\na\\n\\n.\\na\\n\\na\\n\\na\\n\\n.\\na\\n\\na\\n\\n.\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n'}]\n" + ] + } + ], + "source": [ + "# use model\n", + "pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n", + "print(pipe(\"Hello, how are you?\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc72ea8a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7673a5e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "79616f5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7fc76499", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7d51d201", + "metadata": {}, + "outputs": [ + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e76534a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", + " 11341, 389, 262, 976])\n" + ] + } + ], + "source": [ + "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", + "print(tok.input_ids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a44c465a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "381c712f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "153995fe", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}
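
A note on the scaling cell (ff93495e) in scaling.ipynb: scaling_factor is the ratio of the two mean row norms, so dividing emb1 by it rescales modelA's embeddings up to modelB's mean norm, not the other way around. The recorded printout bears this out; a quick arithmetic check, with the constants copied verbatim from the cell output:

# Sanity check of the recorded numbers; mean_a/mean_b are the
# "ModelA/ModelB mean norms" values printed by the cell.
mean_a = 1.3624546527862549
mean_b = 11.50130844116211
scaling_factor = mean_a / mean_b
print(scaling_factor)           # ~0.1184608394563315, matching the printout
print(mean_a / scaling_factor)  # 11.50130844116211 -> emb1 lands at modelB's scale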
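The procrustes() helper implements the orthogonal Procrustes problem: with M = B.T @ A and SVD M = U S Vt, the orthogonal matrix R = U @ Vt minimises ||B @ R - A|| in the Frobenius norm. A minimal self-contained sketch (d, Q and the synthetic A are illustrative here, not taken from the notebooks) verifying that the recipe recovers a known rotation:

import torch

torch.manual_seed(0)
d = 64
A = torch.randn(500, d)                    # stand-in for an embedding matrix
Q, _ = torch.linalg.qr(torch.randn(d, d))  # random orthogonal matrix
B = A @ Q.T                                # same rows in a rotated basis

# Orthogonal Procrustes, exactly as in procrustes():
U, _, Vt = torch.linalg.svd(B.T @ A)
R = U @ Vt

print(torch.norm(R.T @ R - torch.eye(d)).item())  # ~0: R is orthogonal
print(torch.norm(B @ R - A).item())               # ~0: the rotation Q is recovered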
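The data_ptr() cells probe GPT-2's weight tying: lm_head.weight and the input embedding share a single tensor, and swapping the input embeddings does not by itself re-point the head, which is why every embedding swap in these notebooks is followed by a manual lm_head.weight assignment. A sketch of the pitfall, assuming the same gpt2 checkpoint and a transformers version that, like the one recorded here, leaves the head untied after set_input_embeddings():

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
wte = model.get_input_embeddings().weight
print(model.lm_head.weight.data_ptr() == wte.data_ptr())  # True: tied storage

# Swap in a rescaled copy; the head keeps pointing at the old tensor.
scaled = torch.nn.Embedding.from_pretrained(wte.detach() * 2.0)
model.set_input_embeddings(scaled)
print(model.lm_head.weight.data_ptr() ==
      model.get_input_embeddings().weight.data_ptr())  # False: silently untied

# Re-tie by hand, as the notebooks do after each swap.
model.lm_head.weight = model.get_input_embeddings().weight
print(model.lm_head.weight.data_ptr() ==
      model.get_input_embeddings().weight.data_ptr())  # True again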
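On the two identical input_ids tensors: GPT-Neo was trained with GPT-2's byte-level BPE, so tokenizerA and tokenizerB wrap the same vocabulary and the "replace tokenizer" step is a no-op for this model pair. A quick check (needs Hub access, like the notebooks themselves):

from transformers import AutoTokenizer

tokA = AutoTokenizer.from_pretrained("gpt2")
tokB = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

s = "This is a test and i wonder why the tokenizers are the same"
assert tokA(s).input_ids == tokB(s).input_ids  # identical IDs, as in the cells above
print(tokA.vocab_size, tokB.vocab_size)        # both report 50257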