Reran, renamed, added file

This commit is contained in:
Mick Walter 2025-06-20 14:47:57 +02:00
parent 00d2b0e777
commit 6d21f92177
7 changed files with 548 additions and 198 deletions

View File

@ -12,8 +12,10 @@ Only attempt an operation on the input and output embeddings of one model.
Does it break the model? Or is it invariant? Does it break the model? Or is it invariant?
Notebook rotation_fixed.ipynb: Notebook rotation_all.ipynb:
This notebook then attempts to rotate the entire model. So all weights in all transformer layers, etc. This notebook then attempts to rotate the entire model. So all weights in all transformer layers, etc.
This is not as easy as it sounds, and highly model specific: different models have very different internal layers and representations. Layers may have different shapes, or are concatenated (such as the kvq matrices). This is not as easy as it sounds, and highly model specific: different models have very different internal layers and representations. Layers may have different shapes, or are concatenated (such as the kvq matrices).
Notebook rotation_fixed.ipynb:
What happens if you try to rotate the input embedding, and then rotate back just before the first activation function in the first neural network?

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"id": "e28fb85c", "id": "e28fb85c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -28,10 +28,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 2,
"id": "0667e71a", "id": "0667e71a",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mick/pycharmprojects/Frankenstein/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/home/mick/pycharmprojects/Frankenstein/.venv/lib/python3.12/site-packages/transformers/loss/loss_for_object_detection.py:28: UserWarning: A NumPy version >=1.22.4 and <2.3.0 is required for this version of SciPy (detected version 2.3.0)\n",
" from scipy.optimize import linear_sum_assignment\n"
]
}
],
"source": [ "source": [
"import GPUtil\n", "import GPUtil\n",
"\n", "\n",
@ -42,7 +53,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 3,
"id": "0273f299", "id": "0273f299",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -69,7 +80,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 4,
"id": "67d7e006", "id": "67d7e006",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -86,7 +97,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"id": "153e9ff5", "id": "153e9ff5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -102,7 +113,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"id": "1da291ed", "id": "1da291ed",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -112,7 +123,7 @@
"True" "True"
] ]
}, },
"execution_count": 5, "execution_count": 6,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -124,7 +135,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 7,
"id": "dcfc2d85", "id": "dcfc2d85",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -149,7 +160,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"id": "1011d3ad", "id": "1011d3ad",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -177,7 +188,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"id": "c62b2f41", "id": "c62b2f41",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -201,7 +212,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 10,
"id": "2b9893a3", "id": "2b9893a3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -216,7 +227,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 11,
"id": "e1a54c24", "id": "e1a54c24",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -254,7 +265,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 12,
"id": "ff93495e", "id": "ff93495e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -267,7 +278,7 @@
"ModelB mean norms: 11.50130844116211\n", "ModelB mean norms: 11.50130844116211\n",
"Rotated modelB mean norms: 11.501314163208008\n", "Rotated modelB mean norms: 11.501314163208008\n",
"0.3441812447460622\n", "0.3441812447460622\n",
"new_embedding mean norms: 11.501314163208008\n" "new_embedding mean norms: 3.958536148071289\n"
] ]
} }
], ],
@ -300,7 +311,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 13,
"id": "9b671b41", "id": "9b671b41",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -311,7 +322,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 14,
"id": "85957357", "id": "85957357",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -321,7 +332,7 @@
"True" "True"
] ]
}, },
"execution_count": 109, "execution_count": 14,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -331,46 +342,23 @@
"modelA.lm_head.out_features == tokenizerA.vocab_size" "modelA.lm_head.out_features == tokenizerA.vocab_size"
] ]
}, },
{
"cell_type": "markdown",
"id": "f6b39638",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "markdown",
"id": "fbfa8d62",
"metadata": {},
"source": [
"With it:\n"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 15,
"id": "998a0ed6", "id": "998a0ed6",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# make extra sure the input and output weights are tied (if you don't trust that they are in your model)\n", "# code to make extra sure the input and output weights are tied (if you don't trust that they are in your model)\n",
"# usually this should change nothing: they are often even the same object if they are tied.\n", "# usually this should change nothing: they are often even the same object if they are tied.\n",
"# plus we already set this explicitly before, even if it was probably not needed.\n", "# plus we already set this explicitly before, even if it was probably not needed:\n",
"\n",
"# modelA.lm_head.weight = modelA.get_input_embeddings().weight \n" "# modelA.lm_head.weight = modelA.get_input_embeddings().weight \n"
] ]
}, },
{
"cell_type": "markdown",
"id": "aa8b7ca4",
"metadata": {},
"source": [
"Text:"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 114, "execution_count": 21,
"id": "d8d9d612", "id": "d8d9d612",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -386,7 +374,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[{'generated_text': 'Hello, how are you? the he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he'}]\n" "[{'generated_text': 'Hello, how are you? he he he he he he sir he he he he he he he he he he he he he he I I he he he he he he he he he he he he he he he he he sir he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he but he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he I he he he he he he he he he he he he he he he he he he he he he he he he he but he sir he he he he he he he he he he he he but he he he he he he he he he he he he he I he he but he he he he he he he he he he he he he he he he he I he he he he he I he he he he he he he he he he he he he he he he he he he he he he he he he he I he he he he he he he he he he'}]\n"
] ]
} }
], ],
@ -397,24 +385,14 @@
] ]
}, },
{ {
"cell_type": "code", "cell_type": "markdown",
"execution_count": null, "id": "5399f694",
"id": "fc72ea8a",
"metadata": {}, "metadata": {},
"outputs": [],
"source": [] "source": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 17,
"id": "d7673a5e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "79616f5c", "id": "79616f5c",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -424,7 +402,7 @@
"True" "True"
] ]
}, },
"execution_count": 26, "execution_count": 17,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -436,7 +414,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 18,
"id": "7fc76499", "id": "7fc76499",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -446,7 +424,7 @@
"True" "True"
] ]
}, },
"execution_count": 27, "execution_count": 18,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -458,7 +436,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 19,
"id": "7d51d201", "id": "7d51d201",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -480,7 +458,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 20,
"id": "2e76534a", "id": "2e76534a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [

View File

@ -25,7 +25,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 2,
"id": "0667e71a", "id": "0667e71a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -48,7 +48,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 3,
"id": "67d7e006", "id": "67d7e006",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -65,7 +65,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 4,
"id": "153e9ff5", "id": "153e9ff5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -100,7 +100,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 6,
"id": "0160d672", "id": "0160d672",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -111,7 +111,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 7,
"id": "ff93495e", "id": "ff93495e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -120,7 +120,7 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"ModelA mean norms: 3.9585366249084473\n", "ModelA mean norms: 3.9585366249084473\n",
"Rotated modelB mean norms: 3.958536148071289\n", "Rotated modelA mean norms: 3.958536148071289\n",
"new_embedding mean norms: 3.958536148071289\n" "new_embedding mean norms: 3.958536148071289\n"
] ]
} }
@ -143,7 +143,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 109, "execution_count": 8,
"id": "85957357", "id": "85957357",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -153,7 +153,7 @@
"True" "True"
] ]
}, },
"execution_count": 109, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -164,7 +164,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 29, "execution_count": 9,
"id": "d8d9d612", "id": "d8d9d612",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -180,7 +180,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[{'generated_text': 'Hello, how are you? Hitman chemicallychev lobeassi composure029capitalist composure exacerbateightingataka harsher Hoy 1886 typew composure curlsidad harsher Babe lobeMach Titus kindred chemicallyRush Intelligent Scare annihilationoblchev harsher Christy Christyansky peppighting typew composure OPEC HitmanEngineDebugchev lobe conceptions partying IGF partying composure 1886 harsherRush castlesGbLESS composure peppightinglations Optical ENTER Tel harsherRush siph composure 1886 chemicallyRushAbysstechnology Rated instructional Scare annihilationchev harsher Christy Christy Leilan repaidevaluate clamp composure peppighting partyingkie partyingRush522 HitmanEngineDebugRushspective629chevAbyss Rated Ada doesnt harsherRush MILL THESE CSI AchievementsCollinschev lobe Ada doesnt harsher Christy Christy feats! kW unjust Ker workaround Hitman Mondays bunnyManufact Mercenary composuregradient OnePlustown grandmaansky upbringingsei781Alternative 465 Kafka partyingMach Trash totem typew grandma733 composure184capitalist Naplesreditary MISS gazedcele composure528 lobe supremacists Hitman DISTRICT Dominic lair harsher Christy Christy Livebushimaruansky upbringingsei amplification SERVikanovych Christy Christyansky Kafkacanon Wanted spears tamp chemically Ker workaround typew 451 annihilation 1889Mach Trash Mondaysprinted]+ RatedPlotcele NETWORK Trash lobe Curve Mercenary composure vitri833 HitmanManufact harsher Christy Christy gazed jihadists typewMach trout baths Trash781 Ker workaroundolded composureWord Lyndon harsher grandma896 lobe decaying annihilation RatedMach FRI PERSON MondaysAlternativeMach FRI redundancy harsher BCC lobe decaying annihilation CONTROLMach ragedcapitalist CRC Helsinki harsher BCC lobe decaying annihilation'}]\n" "[{'generated_text': 'Hello, how are you? 
tutor harsher Babe lobe composure OPEC PLUS partying composure 1886 harsher Christy ChristyJs typew composure PLUScapitalist youbush passively BinaryUGH HitmanMachPatch annihilation composure hauled invoking totem MILL954 Rated pavement Mercenary methodological harsher Christy Christyanskyighting Cantor composureintensityscrollcapitalistMach Akron Brilliant Mercenary Innocent HitmanOIL typew briefingsMachTranslation Berkshire composureishy harsher Canary harsherEnhanced harsher Hoyishy harsher Canary harsherAlternative SpoMachatakacapitalist Salon chemicallyMethods annihilation Vortex unrem Hitman chemicallychev typewzie annihilation composureishy harsher Canary harsher annihilation pige harsher Christy Christyaimon composure Dartmouth reef Hitman composureishy harsher Canary harsherEnhancedAlternativeflame TECH annihilationuntarilytera Codeslations composure arra Bicycle harsher Christy Christyanskyishy harsher Canary harsherEnhanced typewWhereas annihilation=]MachgradientVisual Stainless Mercenary composure Kore Hitman chemically composureishy harsher Canary harsherAlternativeflamepicking unloaded Aki Brilliant annihilationSquare composure photoc juvenile aeros harsher Christy Christyanskyishy harsher Canary harsher typew decaying annihilationEconomic annihilationuntarily Optical directional bount composure<72> Mercenary Meridian Hitman chemicallychev typew decaying annihilationEconomic annihilationuntarily unloaded Ironically annihilation Forensicchev bount harsher Christy Christyanskyishy harsher Canary harsher MILL complexion annihilation593Machgradientabuse desolate DiscriminationSolid circadian chemically Elkeneg transistor Mercenary composure Zawt Hitman chemicallychev MILLEconomic annihilation tetherchev bount composure photoc juvenile aeros harsher Christy Christyanskyishy harsher Canary harsher throatsflameEconomic annihilationuntarilytera Codeslations composure arra Bicycle Mercenary Chips'}]\n"
] ]
} }
], ],

View File

@ -25,7 +25,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 2,
"id": "0667e71a", "id": "0667e71a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -48,7 +48,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 3,
"id": "67d7e006", "id": "67d7e006",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -65,7 +65,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 4,
"id": "153e9ff5", "id": "153e9ff5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -100,7 +100,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 113, "execution_count": 6,
"id": "c62b2f41", "id": "c62b2f41",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -122,7 +122,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 7,
"id": "2b9893a3", "id": "2b9893a3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -136,7 +136,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 8,
"id": "e1a54c24", "id": "e1a54c24",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -169,7 +169,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"id": "ff93495e", "id": "ff93495e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -177,11 +177,11 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Delta: 6.907746e-05\n", "Delta: 6.706436e-05\n",
"ModelA mean norms: 3.9585366249084473\n", "ModelA mean norms: 3.9585366249084473\n",
"ModelB mean norms: 11.50130844116211\n", "ModelB mean norms: 11.50130844116211\n",
"Rotated modelB mean norms: 3.958536148071289\n", "Rotated modelA mean norms: 3.9585366249084473\n",
"new_embedding mean norms: 3.958536148071289\n" "new_embedding mean norms: 3.9585366249084473\n"
] ]
} }
], ],
@ -205,7 +205,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 109, "execution_count": 10,
"id": "85957357", "id": "85957357",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -215,7 +215,7 @@
"True" "True"
] ]
}, },
"execution_count": 109, "execution_count": 10,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -226,7 +226,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 11,
"id": "d8d9d612", "id": "d8d9d612",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -242,7 +242,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[{'generated_text': 'Hello, how are you?erderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderd cue cue cue cuecue cuecueerd Nicotineerd Nicotineerd Nicotineerd Nicotineerd Nicotine cue Nicotine cue Nicotine cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue'}]\n" "[{'generated_text': 'Hello, how are you?erderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderderd cue cue cue cuecue cuecueerd Nicotineerd Nicotineerd Nicotineerd Nicotine cue Nicotine cue Nicotine cue Nicotine cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue 
cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue cue'}]\n"
] ]
} }
], ],

348
rotation_all.ipynb Normal file
View File

@ -0,0 +1,348 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e28fb85c",
"metadata": {},
"outputs": [],
"source": [
"# %pip install gputil\n",
"# %pip install setuptools\n",
"# %pip install transformers\n",
"# %pip install torch\n",
"\n",
"# %pip install auto-gptq #==0.4.0"
]
},
{
"cell_type": "markdown",
"id": "10e0a35a",
"metadata": {},
"source": [
    "What happens if you try to rotate an entire LLM? Will it still work if you consistently rotate all trained matrices?\n",
"\n",
    "Doing this is very specific to the internal representations of a particular LLM. Different models have very different internal layers and representations. Layers may have different shapes, or are concatenated (such as the kvq matrices). \n",
"\n",
"Should all matrices be rotated, and which should be conjugated? \n",
"\n",
"This notebook just offers some base code, it's still far removed from the right approach."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0667e71a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mick/pycharmprojects/Frankenstein/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import GPUtil\n",
"\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
"import torch\n",
"# from auto_gptq import AutoGPTQForCausalLM"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0273f299",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No GPU detected on this system.\n"
]
}
],
"source": [
"gpus = GPUtil.getGPUs()\n",
"if not gpus:\n",
" print(\"No GPU detected on this system.\")\n",
"else:\n",
" for gpu in gpus:\n",
" print(f\"GPU Name: {gpu.name}\")\n",
" print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n",
" print(f\"Free VRAM: {gpu.memoryFree} MB\")\n",
" print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n",
" print(\"-\" * 40)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "67d7e006",
"metadata": {},
"outputs": [],
"source": [
"def grab_model(model_name, quantized = False):\n",
" if quantized:\n",
" model = AutoGPTQForCausalLM.from_quantized(model_name, device=\"cpu\", use_safetensors=True)\n",
" else:\n",
" model = AutoModelForCausalLM.from_pretrained(model_name)\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
" return model, tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "153e9ff5",
"metadata": {},
"outputs": [],
"source": [
"modelA, tokenizerA = grab_model(\"gpt2\")\n",
"modelB, tokenizerB = grab_model(\"EleutherAI/gpt-neo-125M\")\n",
"\n",
"# modelA, tokenizerA = grab_model(\"EleutherAI/gpt-neo-125M-4bit\", quantized=True)\n",
"# modelB, tokenizerB = grab_model(\"iproskurina/opt-125m-GPTQ-4bit-g128\", quantized=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1da291ed",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelA.config.hidden_size == modelB.config.hidden_size "
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c62b2f41",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(False)\n",
"tensor(22.2842, grad_fn=<MaxBackward1>)\n",
"tensor(11.5013, grad_fn=<MeanBackward0>)\n"
]
}
],
"source": [
"print(torch.isnan(modelB.get_input_embeddings().weight).any())\n",
"print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n",
"print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "2b9893a3",
"metadata": {},
"outputs": [],
"source": [
"def check_orthogonal(R):\n",
" I = torch.eye(R.size(0), device=R.device)\n",
" delta = torch.norm(R.T @ R - I)\n",
" print(f\"Delta: {delta:.6e}\")\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e1a54c24",
"metadata": {},
"outputs": [],
"source": [
    "# use Procrustes analysis:\n",
"def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n",
" # A_centered = A - A.mean(dim=0, keepdim=True)\n",
" # B_centered = B - B.mean(dim=0, keepdim=True)\n",
"\n",
" #M = B_centered.T @ A_centered\n",
" M = B.T @ A\n",
" # find optimal rotation with svd\n",
" U, _, Vt = torch.linalg.svd(M)\n",
"\n",
" # get rotation matrix that aligns B to A\n",
" R = U @ Vt\n",
"\n",
" check_orthogonal(R)\n",
" \n",
    " return R # return the rotation matrix that aligns B to A\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fedd4d04",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ff93495e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Delta: 6.706436e-05\n",
"torch.Size([1024, 768])\n"
]
}
],
"source": [
"emb1 = modelA.get_input_embeddings().weight\n",
"emb2 = modelB.get_input_embeddings().weight\n",
"\n",
"# get rotation matrix\n",
"R = procrustes(emb2, emb1)\n",
"emb1_R = emb1 @ R\n",
"\n",
"new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n",
"\n",
"modelA.set_input_embeddings(new_embedding)\n",
"modelA.lm_head.weight = new_embedding.weight\n",
"\n",
"# def rotate_weight(W, R):\n",
"# if W.shape[1] == R.shape[0]:\n",
"# return W @ R\n",
"# if W.shape[0] == R.shape[0]:\n",
"# return R.T @ W\n",
"\n",
"# now fix the other layers by conjugating:\n",
"# for block in modelA.transformer.h:\n",
"# for M in [block.attn.c_attn, block.mlp.c_fc]:\n",
"# W = M.weight.data\n",
"# W[:] = R.T @ W\n",
"# for M in [block.attn.c_proj, block.mlp.c_proj]:\n",
"# W = M.weight.data\n",
"# W[:] = R.T @ W @ R\n",
"\n",
"def split_rotate_concat(W):\n",
" parts1 = [x for x in W.split(768, dim=1)]\n",
" for i, v in enumerate(parts1):\n",
" parts2 = [x for x in v.split(768, dim=0)]\n",
" for j, w in enumerate(parts2):\n",
" parts2[j] = R.T @ w @ R\n",
" parts1[i] = torch.cat(parts2, dim=0)\n",
" return torch.cat(parts1, dim=1)\n",
"\n",
"\n",
"def rotate_layernorm(ln):\n",
" ln.weight.data[:] = ln.weight.data @ R\n",
" ln.bias.data[:] = ln.bias.data @ R\n",
"\n",
"for block in modelA.transformer.h:\n",
" # print(block.attn.c_attn.weight.data.shape)\n",
" # print(block.mlp.c_fc.weight.data.shape)\n",
" # print(block.attn.c_proj.weight.data.shape)\n",
" # print(block.mlp.c_proj.weight.data.shape)\n",
" # block.attn.c_attn.weight.data[:] = split_rotate_concat(block.attn.c_attn.weight.data.T).T\n",
" # block.mlp.c_fc.weight.data[:] = split_rotate_concat(block.mlp.c_fc.weight.data.T).T\n",
" block.attn.c_attn.weight.data[:] = split_rotate_concat(block.attn.c_attn.weight.data)\n",
" block.mlp.c_fc.weight.data[:] = split_rotate_concat(block.mlp.c_fc.weight.data)\n",
" block.attn.c_proj.weight.data[:] = split_rotate_concat(block.attn.c_proj.weight.data)\n",
" block.mlp.c_proj.weight.data[:] = split_rotate_concat(block.mlp.c_proj.weight.data)\n",
" rotate_layernorm(block.ln_1)\n",
" rotate_layernorm(block.ln_2)\n",
"\n",
"rotate_layernorm(modelA.transformer.ln_f)\n",
"\n",
"\n",
"print(modelA.transformer.wpe.weight.data.shape)\n",
"modelA.transformer.wpe.weight.data[:] = modelA.transformer.wpe.weight.data @ R\n",
"\n",
" # for name in ['c_attn', 'c_proj']:\n",
" # W = getattr(block.attn, name).weight.data\n",
" # W[:] = R.T @ W @ R\n",
" # w1 = block.mlp.c_fc.weight.data\n",
" # w2 = block.mlp.c_proj.weight.data\n",
" # w1[:] = R.T @ W1 @ R\n",
" # w2[:] = R.T @ W2 @ R\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d8d9d612",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Device set to use cpu\n",
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'generated_text': 'Hello, how are you?orm Coulorm Coulormorm Coulão Coulorm Coulorm Coulorm Coulorm Coulorm Coulonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomous Coulonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousidableonomousonomousonomousato Coulonomousonomousonomous Coulonomousonomousonomouskeleyonomous Coulorm Amenomniaifulstad Amenonomousonomous Amenstad Amenomnia Amenomniaifulomniaomniaormstadifulifulomnia Coulonomousifulomniaomniaomniaifulomniaomnia Coulifulomniahered Coul Amenomniakeleyomniaomniastadifulomnia Amenomniaomniaomniakeleyomniaomniaomniastad…]omniaomnia Coulkeleyomniaomniaomnia Coulomniakeleyomnia Coulomniaomniaomniaifulomniaomniaomniakeleyomniaomniaomniaomniaomniaomniaomniaomniastadomniaomniaomnia CoulkeleyomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaNRSormkeleyomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaomniaomnia'}]\n"
]
}
],
"source": [
"# use model\n",
"pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n",
"print(pipe(\"Hello, how are you?\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc72ea8a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 53,
"id": "e28fb85c", "id": "e28fb85c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -20,30 +20,17 @@
"id": "10e0a35a", "id": "10e0a35a",
"metadata": {}, "metadata": {},
"source": [ "source": [
"What happens if you try to rotate an entire LMM model. Will it still work if you consistently rotate all trained matrices?\n", "What happens if you try to rotate the input embedding, and then rotate back just before the first activation function in the first neural network? \n",
"\n", "\n",
"Doing this is very specific to the internal representations of a particular LMM. Different models have very different internal layers and representations. Layers may have different shapes, or are concatenated (such as the kvq matrices). \n", "That should work, but the input and output embedding are tied, so they have to be untied.\n"
"\n",
"Should all matrices be rotated, and which should be conjugated? \n",
"\n",
"This notebook just offers some base code, it's still far removed from the right approach."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 54,
"id": "0667e71a", "id": "0667e71a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mick/pycharmprojects/Frankenstein/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [ "source": [
"import GPUtil\n", "import GPUtil\n",
"\n", "\n",
@ -54,7 +41,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 55,
"id": "0273f299", "id": "0273f299",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -81,7 +68,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 56,
"id": "67d7e006", "id": "67d7e006",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -98,7 +85,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 63, "execution_count": 57,
"id": "153e9ff5", "id": "153e9ff5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -112,7 +99,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 58,
"id": "1da291ed", "id": "1da291ed",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -122,7 +109,7 @@
"True" "True"
] ]
}, },
"execution_count": 5, "execution_count": 58,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -133,7 +120,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 113, "execution_count": 59,
"id": "c62b2f41", "id": "c62b2f41",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -155,7 +142,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 60,
"id": "2b9893a3", "id": "2b9893a3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -169,7 +156,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 61,
"id": "e1a54c24", "id": "e1a54c24",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -194,15 +181,28 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 62,
"id": "fedd4d04", "id": "fedd4d04",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
"source": [] {
"data": {
"text/plain": [
"torch.Size([768, 3072])"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelA.transformer.h[0].mlp.c_fc.weight.shape"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 63,
"id": "ff93495e", "id": "ff93495e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -223,73 +223,103 @@
"R = procrustes(emb2, emb1)\n", "R = procrustes(emb2, emb1)\n",
"emb1_R = emb1 @ R\n", "emb1_R = emb1 @ R\n",
"\n", "\n",
"original_embedding = emb1.data.clone()\n",
"\n",
"new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n", "new_embedding = torch.nn.Embedding.from_pretrained(emb1_R)\n",
"\n", "\n",
"modelA.set_input_embeddings(new_embedding)\n", "modelA.set_input_embeddings(new_embedding)\n",
"modelA.lm_head.weight = new_embedding.weight\n", "# modelA.lm_head.weight = new_embedding.weight\n",
"# modelA.lm_head.weight.data[:] = new_embedding.weight.data[:]\n",
"\n", "\n",
"# def rotate_weight(W, R):\n", "# untie the head:\n",
"# if W.shape[1] == R.shape[0]:\n", "modelA.lm_head = torch.nn.Linear(modelA.config.n_embd, modelA.config.vocab_size, bias = False)\n",
"# return W @ R\n", "# modelA.lm_head.weight.data[:] = original_embedding\n",
"# if W.shape[0] == R.shape[0]:\n", "modelA.lm_head.weight.data = original_embedding\n",
"# return R.T @ W\n",
"\n",
"# now fix the other layers by conjugating:\n",
"# for block in modelA.transformer.h:\n",
"# for M in [block.attn.c_attn, block.mlp.c_fc]:\n",
"# W = M.weight.data\n",
"# W[:] = R.T @ W\n",
"# for M in [block.attn.c_proj, block.mlp.c_proj]:\n",
"# W = M.weight.data\n",
"# W[:] = R.T @ W @ R\n",
"\n",
"def split_rotate_concat(W):\n",
" parts1 = [x for x in W.split(768, dim=1)]\n",
" for i, v in enumerate(parts1):\n",
" parts2 = [x for x in v.split(768, dim=0)]\n",
" for j, w in enumerate(parts2):\n",
" parts2[j] = R.T @ w @ R\n",
" parts1[i] = torch.cat(parts2, dim=0)\n",
" return torch.cat(parts1, dim=1)\n",
"\n",
"\n",
"def rotate_layernorm(ln):\n",
" ln.weight.data[:] = ln.weight.data @ R\n",
" ln.bias.data[:] = ln.bias.data @ R\n",
"\n",
"for block in modelA.transformer.h:\n",
" # print(block.attn.c_attn.weight.data.shape)\n",
" # print(block.mlp.c_fc.weight.data.shape)\n",
" # print(block.attn.c_proj.weight.data.shape)\n",
" # print(block.mlp.c_proj.weight.data.shape)\n",
" # block.attn.c_attn.weight.data[:] = split_rotate_concat(block.attn.c_attn.weight.data.T).T\n",
" # block.mlp.c_fc.weight.data[:] = split_rotate_concat(block.mlp.c_fc.weight.data.T).T\n",
" block.attn.c_attn.weight.data[:] = split_rotate_concat(block.attn.c_attn.weight.data)\n",
" block.mlp.c_fc.weight.data[:] = split_rotate_concat(block.mlp.c_fc.weight.data)\n",
" block.attn.c_proj.weight.data[:] = split_rotate_concat(block.attn.c_proj.weight.data)\n",
" block.mlp.c_proj.weight.data[:] = split_rotate_concat(block.mlp.c_proj.weight.data)\n",
" rotate_layernorm(block.ln_1)\n",
" rotate_layernorm(block.ln_2)\n",
"\n",
"rotate_layernorm(modelA.transformer.ln_f)\n",
"\n", "\n",
"# rotate back only the weights of first layer of the first NN encountered (before first activation function)\n",
"modelA.transformer.h[0].mlp.c_fc.weight.data[:] = (modelA.transformer.h[0].mlp.c_fc.weight.data.T @ R.T).T\n",
"\n", "\n",
"print(modelA.transformer.wpe.weight.data.shape)\n", "print(modelA.transformer.wpe.weight.data.shape)\n",
"modelA.transformer.wpe.weight.data[:] = modelA.transformer.wpe.weight.data @ R\n", "modelA.transformer.wpe.weight.data[:] = modelA.transformer.wpe.weight.data @ R\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "a8ef9109",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[True, True, True, ..., True, True, True],\n",
" [True, True, True, ..., True, True, True],\n",
" [True, True, True, ..., True, True, True],\n",
" ...,\n",
" [True, True, True, ..., True, True, True],\n",
" [True, True, True, ..., True, True, True],\n",
" [True, True, True, ..., True, True, True]])"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelA.lm_head.weight == emb1\n",
"\n", "\n",
" # for name in ['c_attn', 'c_proj']:\n", "# but somehow, input and output embedding are still tied...?"
" # W = getattr(block.attn, name).weight.data\n", ]
" # W[:] = R.T @ W @ R\n", },
" # w1 = block.mlp.c_fc.weight.data\n", {
" # w2 = block.mlp.c_proj.weight.data\n", "cell_type": "code",
" # w1[:] = R.T @ W1 @ R\n", "execution_count": 65,
" # w2[:] = R.T @ W2 @ R\n", "id": "848d2fc2",
" \n" "metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modelA.transformer.wte is modelA.lm_head.weight"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 66, "execution_count": 66,
"id": "33888de4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(890.5013)\n",
"tensor(890.4819, grad_fn=<LinalgVectorNormBackward0>)\n"
]
}
],
"source": [
"# modelA.lm_head.weight.data.zero_()\n",
"print(torch.norm(modelA.transformer.wte.weight))\n",
"print(torch.norm(modelA.lm_head.weight))\n",
"\n",
"# proof: they are untied, and seem rotated."
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "d8d9d612", "id": "d8d9d612",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -305,7 +335,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[{'generated_text': 'Hello, how are you?orm Coulormormorm Coulorm Coul Coulorm Coul Coulinion Coulorm Coulonomousonomous Coulonomousonomousonomousonomousonomousormonomous Coulorm Coulonomousonomousonomous Coulonomousonomous Coulonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomous Coulonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Amenonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomousonomous Coulonomous Coulonomousonomousonomousonomousonomousonomousonomoushered…] Coulonomousonomousonomous Amenomniaifulonomousonomouskeleyifulonomous Amenomniaifulhered Amenkeleyomniastad Coulonomousifulifulomniaifulomniaifulomniaifulifulifulomniaifulomnia…]hered…]ifulomniaifulifulomniastadkeleyomniaifulifulomniaifulomniaifulomniakeleyomniaomniaomnia Coulomniaifulomnia Coulifulomnia Coul Coulkeleyomniastad Coulomnia Coulkeleyomnia Coulkeleyomnia Coulkeleyomniaomnia Coulkeleyomniaomniaomniaomniastadomniaomniaomniaomnia Coulkeleyonomousomnia Coulomniaomniaomnia Coulkeleyomnia Coulomniaomniaomniaomnia Coulomniaomniakeleyomniakeleyomniakeleyomniaomniaomniaomniakeleystadkeleyomniakeleyomniaomnia'}]\n" "[{'generated_text': 'Hello, how are you?.,, that..,. and\\n\\n the. of...,. this.., and. and.,.\\'the.!. and (...,... the, for, to...,......,.. and,, if your the more.,.., the,., and.., that or.,..,., the the..\\'of the an...... the or.. or and to... to., the.,. the and.. do. to [ that. of that the and,.. who that..,...,... for. for or,.. with the,.. a.,. in a.. (,\\n..... and. or the... the-. the). to, the.. that \",.... you. the on. or.-- of.,. and are on..:\\n of:. and. that that or.,,,.. 
of, for,,., for the,\\n or,... and, the\\n'}]\n"
] ]
} }
], ],

View File

@ -25,7 +25,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 2,
"id": "0667e71a", "id": "0667e71a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -48,7 +48,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 3,
"id": "0273f299", "id": "0273f299",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -75,7 +75,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 4,
"id": "67d7e006", "id": "67d7e006",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -92,7 +92,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 5,
"id": "153e9ff5", "id": "153e9ff5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -106,7 +106,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 6,
"id": "1da291ed", "id": "1da291ed",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -116,7 +116,7 @@
"True" "True"
] ]
}, },
"execution_count": 5, "execution_count": 6,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -127,7 +127,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 113, "execution_count": 7,
"id": "c62b2f41", "id": "c62b2f41",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -149,7 +149,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 8,
"id": "2b9893a3", "id": "2b9893a3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -163,7 +163,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"id": "ff93495e", "id": "ff93495e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -171,10 +171,10 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"ModelA mean norms: 1.3624546527862549\n", "ModelA mean norms: 3.9585366249084473\n",
"ModelB mean norms: 11.50130844116211\n", "ModelB mean norms: 11.50130844116211\n",
"0.1184608394563315\n", "Scaling factor: 0.34418141598056917\n",
"new_embedding mean norms: 11.50130844116211\n" "new_embedding mean norms: 11.501307487487793\n"
] ]
} }
], ],
@ -199,7 +199,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 109, "execution_count": 10,
"id": "85957357", "id": "85957357",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -209,7 +209,7 @@
"True" "True"
] ]
}, },
"execution_count": 109, "execution_count": 10,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -220,7 +220,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 11,
"id": "d8d9d612", "id": "d8d9d612",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -236,7 +236,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[{'generated_text': 'Hello, how are you?\\n\\nYou are not a new.\\n\\nYou are a new.\\n\\n\\nYou are not a new.\\n\\n\\nYou are not a new.\\n\\n\\na new.\\n\\na new.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na\\n\\na.\\n\\na\\n\\n.\\na\\n\\na\\n\\na\\n\\n.\\na\\n\\na\\n\\n.\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n'}]\n" "[{'generated_text': \"Hello, how are you?\\n\\nYou're not going to be able to know what you're going to do.\\n\\n\\nYou're not going to be able to know what you're going to do.\\n\\n\\nYou're not going to be able to know what you're going to do.\\n\\n\\nYou're not going to be going to be going to be.\\n\\nYou're going to be going to be going to be.\\n\\nYou're going to be going to be.\\n\\nYou're going to be going to be.\\n\\nYou're going to be going to be.\\n\\nYou're going to be going to be.\\n\\nYou're going to be going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou're going to be.\\n\\nYou\"}]\n"
] ]
} }
], ],
@ -245,14 +245,6 @@
"pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n", "pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n",
"print(pipe(\"Hello, how are you?\"))" "print(pipe(\"Hello, how are you?\"))"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc72ea8a",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {