Frankenstein/embedding.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e28fb85c",
"metadata": {},
"outputs": [],
"source": [
"# %pip install gputil\n",
"# %pip install setuptools\n",
"# %pip install transformers\n",
"# %pip install torch\n",
"\n",
"# %pip install auto-gptq #==0.4.0"
]
},
{
"cell_type": "markdown",
"id": "01f3cd1d",
"metadata": {},
"source": [
"This notebook attempts to open up 2 LMM models, and swap out the input and output embeddings and tokenizer.\n",
"It then runs the model to see if the output still makes sense (hint: actually I dont think you need a hint)\n",
"\n",
"Be mindful of the difference between weights[:]= and weights= (I probably messed that up pretty often)"
]
},
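{
"cell_type": "markdown",
"id": "3f1a2b4c",
"metadata": {},
"source": [
"A minimal added sketch (not part of the original experiment) of the `weights[:] =` vs `weights =` distinction mentioned above: an in-place write is visible through a tied reference, while rebinding the name silently breaks the tie. The tensors below are stand-ins, not real model weights."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c8d7e6f",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: stand-in tensors, not real embedding matrices.\n",
"import torch\n",
"\n",
"tied_in = torch.nn.Parameter(torch.zeros(4, 2))   # stand-in for the input embedding weight\n",
"tied_out = tied_in                                 # stand-in for a tied lm_head weight\n",
"new_values = torch.ones(4, 2)\n",
"\n",
"tied_in.data[:] = new_values   # in-place: the change is visible through tied_out (same storage)\n",
"print(tied_out[0])             # ones\n",
"\n",
"tied_in = torch.nn.Parameter(new_values * 2)  # rebinding: tied_out still points at the old tensor\n",
"print(tied_out[0])             # still ones -> the tie is broken"
]
},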
{
"cell_type": "code",
"execution_count": 12,
"id": "0667e71a",
"metadata": {},
"outputs": [],
"source": [
"import GPUtil\n",
"\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
"import torch\n",
"# from auto_gptq import AutoGPTQForCausalLM"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0273f299",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No GPU detected on this system.\n"
]
}
],
"source": [
"gpus = GPUtil.getGPUs()\n",
"if not gpus:\n",
" print(\"No GPU detected on this system.\")\n",
"else:\n",
" for gpu in gpus:\n",
" print(f\"GPU Name: {gpu.name}\")\n",
" print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n",
" print(f\"Free VRAM: {gpu.memoryFree} MB\")\n",
" print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n",
" print(\"-\" * 40)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "67d7e006",
"metadata": {},
"outputs": [],
"source": [
"def grab_model(model_name, quantized = False):\n",
" if quantized:\n",
" model = AutoGPTQForCausalLM.from_quantized(model_name, device=\"cpu\", use_safetensors=True)\n",
" else:\n",
" model = AutoModelForCausalLM.from_pretrained(model_name)\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
" return model, tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "153e9ff5",
"metadata": {},
"outputs": [],
"source": [
"modelA, tokenizerA = grab_model(\"gpt2\")\n",
"modelB, tokenizerB = grab_model(\"EleutherAI/gpt-neo-125M\")\n",
"\n",
"# modelA, tokenizerA = grab_model(\"EleutherAI/gpt-neo-125M-4bit\", quantized=True)\n",
"# modelB, tokenizerB = grab_model(\"iproskurina/opt-125m-GPTQ-4bit-g128\", quantized=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1da291ed",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check if the dimensionality is identical\n",
"modelA.config.hidden_size == modelB.config.hidden_size "
]
},
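{
"cell_type": "markdown",
"id": "4b5c6d7e",
"metadata": {},
"source": [
"Added check (not in the original notebook): besides the hidden size, the vocabulary sizes should also match for the embedding swap to line up row-for-row."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e9f0a1b",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: the embedding matrices can only be swapped row-for-row if the vocab sizes match too.\n",
"modelA.config.vocab_size == modelB.config.vocab_size"
]
},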
{
"cell_type": "code",
"execution_count": null,
"id": "dcfc2d85",
"metadata": {},
"outputs": [],
"source": [
"## ATTEMPT 1: \n",
"## replace input and output embeddings (tied), \n",
"## optionally resize token embeddings (probably not needed, definitely not needed if tokenizer is the same)\n",
"## optionally also swap the positional encoding (wpe) \n",
"\n",
"\n",
"### replace tokenizer:\n",
"## modelA.Tokenizer = tokenizerB # not necessary: the pipeline will not access this directly anyway\n",
"\n",
"## replace token embeddings for input and output:\n",
"# modelA.set_input_embeddings(modelB.get_input_embeddings())\n",
"# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n",
"# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n",
"\n",
"# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1011d3ad",
"metadata": {},
"outputs": [],
"source": [
"## ATTEMPT 2:\n",
"## add a scaling factor, attempting to normalize weight magnitude to match the other model better\n",
"\n",
"# emb1 = modelA.get_input_embeddings().weight\n",
"# emb2 = modelB.get_input_embeddings().weight\n",
"\n",
"# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
"# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
"\n",
"# scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n",
"\n",
"# print(scaling_factor)\n",
"\n",
"# new_embedding = torch.nn.Embedding.from_pretrained(emb2*scaling_factor)\n",
"\n",
"# print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
"\n",
"# modelA.set_input_embeddings(new_embedding)\n",
"# modelA.lm_head.weight = new_embedding.weight\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c62b2f41",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(False)\n",
"tensor(22.2842, grad_fn=<MaxBackward1>)\n",
"tensor(11.5013, grad_fn=<MeanBackward0>)\n"
]
}
],
"source": [
"# check the max and mean values (should not be too big) and nan values (should not occur)\n",
"\n",
"print(torch.isnan(modelB.get_input_embeddings().weight).any())\n",
"print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n",
"print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b9893a3",
"metadata": {},
"outputs": [],
"source": [
"# optional check if the resulting rotational matrix is an orthogonal rotation matrix (it should be, within e-5 ish)\n",
"def check_orthogonal(R):\n",
" I = torch.eye(R.size(0), device=R.device)\n",
" delta = torch.norm(R.T @ R - I)\n",
" print(f\"Delta: {delta:.6e}\")\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1a54c24",
"metadata": {},
"outputs": [],
"source": [
"# use proscrustes:\n",
"def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n",
" # center the model - recommended not to do this.. \n",
" # rotational matrix may improve but the shift kills everything\n",
" # if you do this, probably should shift it back afterwards again in the rotated frame.\n",
" # A_centered = A - A.mean(dim=0, keepdim=True)\n",
" # B_centered = B - B.mean(dim=0, keepdim=True)\n",
"\n",
" #M = B_centered.T @ A_centered\n",
" M = B.T @ A\n",
"\n",
" # find optimal rotation with svd\n",
" U, _, Vt = torch.linalg.svd(M)\n",
"\n",
" # get rotation matrix that aligns B to A\n",
" R = U @ Vt\n",
"\n",
" check_orthogonal(R)\n",
" \n",
" return B @ R # return rotated tensor B\n",
"\n",
"def get_rotated_matrix(A, B, n = 1000):\n",
" # use only the first n tokens for rotation:\n",
" # return procrustes(A[:n], B[:n])\n",
" # or use all, if the model is small enough (it's usually fine, and a badly rotated matrix causes problems):\n",
" return procrustes(A, B)\n",
" \n",
"\n",
"\n"
]
},
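{
"cell_type": "markdown",
"id": "5e4d3c2b",
"metadata": {},
"source": [
"For reference (added note, not part of the original run): the orthogonal Procrustes problem the function above solves is\n",
"\n",
"$$R^{*} = \\arg\\min_{R^{\\top} R = I} \\lVert A - B R \\rVert_{F},$$\n",
"\n",
"with the closed-form solution $R^{*} = U V^{\\top}$, where $M = B^{\\top} A = U \\Sigma V^{\\top}$ is the SVD computed in `procrustes`."
]
},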
{
"cell_type": "code",
"execution_count": null,
"id": "ff93495e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Delta: 6.659338e-05\n",
"ModelA mean norms: 3.9585366249084473\n",
"ModelB mean norms: 11.50130844116211\n",
"Rotated modelB mean norms: 11.501314163208008\n",
"0.3441812447460622\n",
"new_embedding mean norms: 11.501314163208008\n"
]
}
],
"source": [
"# THIRD ATTEMPT:\n",
"# try aligning with Procrustes first before swapping. Still also uses rescaling.\n",
"\n",
"emb1 = modelA.get_input_embeddings().weight\n",
"emb2 = modelB.get_input_embeddings().weight\n",
"\n",
"emb2_R = get_rotated_matrix(emb1, emb2)\n",
"\n",
"print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
"print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
"print(\"Rotated modelB mean norms:\", torch.norm(emb2_R, dim=1).mean().item())\n",
"\n",
"scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2_R, dim=1).mean().item()\n",
"\n",
"print(scaling_factor)\n",
"\n",
"new_embedding = torch.nn.Embedding.from_pretrained(emb2_R*scaling_factor)\n",
"# new_embedding = torch.nn.Embedding.from_pretrained(emb2_R)\n",
"\n",
"\n",
"print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
"\n",
"modelA.set_input_embeddings(new_embedding)\n",
"modelA.lm_head.weight = new_embedding.weight\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b671b41",
"metadata": {},
"outputs": [],
"source": [
"# optionally swap the position encoding weights.\n",
"modelA.transformer.wpe.weight = modelB.transformer.wpe.weight"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85957357",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check if the output shape matches the tokenizer vocab size (it should)\n",
"modelA.lm_head.out_features == tokenizerA.vocab_size"
]
},
{
"cell_type": "markdown",
"id": "f6b39638",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "markdown",
"id": "fbfa8d62",
"metadata": {},
"source": [
"With it:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "998a0ed6",
"metadata": {},
"outputs": [],
"source": [
"# make extra sure the input and output weights are tied (if you don't trust that they are in your model)\n",
"# usually this should change nothing: they are often even the same object if they are tied.\n",
"# plus we already set this explicitly before, even if it was probably not needed.\n",
"# modelA.lm_head.weight = modelA.get_input_embeddings().weight \n"
]
},
{
"cell_type": "markdown",
"id": "aa8b7ca4",
"metadata": {},
"source": [
"Text:"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "d8d9d612",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Device set to use cpu\n",
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'generated_text': 'Hello, how are you? the he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he he'}]\n"
]
}
],
"source": [
"# use model\n",
"pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n",
"print(pipe(\"Hello, how are you?\"))"
]
},
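{
"cell_type": "markdown",
"id": "7b6a5948",
"metadata": {},
"source": [
"Added suggestion (not run in the original notebook): for comparison, one could reload an unmodified gpt2 and generate from the same prompt, to see the baseline output next to the Frankensteined one. This assumes the model can still be loaded from the Hugging Face hub."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a2b3c4d",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: baseline comparison against an unmodified gpt2 on the same prompt.\n",
"baseline_model, baseline_tokenizer = grab_model(\"gpt2\")\n",
"baseline_pipe = pipeline(\"text-generation\", model=baseline_model, tokenizer=baseline_tokenizer)\n",
"print(baseline_pipe(\"Hello, how are you?\"))"
]
},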
{
"cell_type": "code",
"execution_count": null,
"id": "fc72ea8a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7673a5e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "79616f5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# extra check that input and output weights match\n",
"modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fc76499",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# extra check that input and output weights match\n",
"modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d51d201",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"# print the token IDs for the given string -> turns out these gpt2 and neo use the same tokenizer\n",
"# thats why swapping them in the pipeline had no effect at all (by itself should break model unless identical) \n",
"tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2e76534a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n",
" 11341, 389, 262, 976])\n"
]
}
],
"source": [
"tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n",
"print(tok.input_ids[0])"
]
},
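{
"cell_type": "markdown",
"id": "2f3e4d5c",
"metadata": {},
"source": [
"Added sanity check (not part of the original run): if GPT-2 and GPT-Neo really share a tokenizer, their vocabularies should compare equal."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c7d8e9f",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: confirm the two tokenizers share the same vocabulary,\n",
"# which would explain why swapping them in the pipeline changed nothing.\n",
"print(tokenizerA.vocab_size == tokenizerB.vocab_size)\n",
"print(tokenizerA.get_vocab() == tokenizerB.get_vocab())"
]
},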
{
"cell_type": "code",
"execution_count": null,
"id": "a44c465a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "381c712f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "153995fe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}