{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e28fb85c", "metadata": {}, "outputs": [], "source": [ "# %pip install gputil\n", "# %pip install setuptools\n", "# %pip install transformers\n", "# %pip install torch\n", "\n", "# %pip install auto-gptq #==0.4.0" ] }, { "cell_type": "code", "execution_count": 1, "id": "0667e71a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/mick/pycharmprojects/Frankenstein/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import GPUtil\n", "\n", "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n", "import torch\n", "# from auto_gptq import AutoGPTQForCausalLM" ] }, { "cell_type": "code", "execution_count": 7, "id": "0273f299", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "No GPU detected on this system.\n" ] } ], "source": [ "gpus = GPUtil.getGPUs()\n", "if not gpus:\n", " print(\"No GPU detected on this system.\")\n", "else:\n", " for gpu in gpus:\n", " print(f\"GPU Name: {gpu.name}\")\n", " print(f\"Total VRAM: {gpu.memoryTotal} MB\")\n", " print(f\"Free VRAM: {gpu.memoryFree} MB\")\n", " print(f\"Used VRAM: {gpu.memoryUsed} MB\")\n", " print(\"-\" * 40)" ] }, { "cell_type": "code", "execution_count": 2, "id": "67d7e006", "metadata": {}, "outputs": [], "source": [ "def grab_model(model_name, quantized = False):\n", " if quantized:\n", " model = AutoGPTQForCausalLM.from_quantized(model_name, device=\"cpu\", use_safetensors=True)\n", " else:\n", " model = AutoModelForCausalLM.from_pretrained(model_name)\n", "\n", " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", " return model, tokenizer" ] }, { "cell_type": "code", "execution_count": 3, "id": "153e9ff5", "metadata": {}, "outputs": [], "source": [ "modelA, tokenizerA = grab_model(\"gpt2\")\n", "modelB, tokenizerB = grab_model(\"EleutherAI/gpt-neo-125M\")\n", "\n", "# modelA, tokenizerA = grab_model(\"EleutherAI/gpt-neo-125M-4bit\", quantized=True)\n", "# modelB, tokenizerB = grab_model(\"iproskurina/opt-125m-GPTQ-4bit-g128\", quantized=True)" ] }, { "cell_type": "code", "execution_count": 5, "id": "1da291ed", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "modelA.config.hidden_size == modelB.config.hidden_size " ] }, { "cell_type": "code", "execution_count": 60, "id": "dcfc2d85", "metadata": {}, "outputs": [], "source": [ "# replace tokenizer:\n", "# modelA.Tokenizer = tokenizerB # optional when not accessing directly \n", "\n", "# replace token embeddings for input and output:\n", "# modelA.set_input_embeddings(modelB.get_input_embeddings())\n", "# modelA.lm_head.weight = modelB.get_input_embeddings().weight\n", "# modelA.resize_token_embeddings(tokenizerB.vocab_size)\n", "\n", "# modelA.transformer.wpe.weight = modelB.transformer.wpe.weight\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1011d3ad", "metadata": {}, "outputs": [], "source": [ "# emb1 = modelA.get_input_embeddings().weight\n", "# emb2 = modelB.get_input_embeddings().weight\n", "\n", "# print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n", "# print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n", "\n", "# scaling_factor = 
 { "cell_type": "code", "execution_count": 113, "id": "c62b2f41", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "tensor(False)\n",
  "tensor(22.2842, grad_fn=<MaxBackward1>)\n",
  "tensor(11.5013, grad_fn=<MeanBackward0>)\n"
 ] } ], "source": [
  "# sanity-check modelB's embedding matrix: no NaNs, reasonable row norms\n",
  "print(torch.isnan(modelB.get_input_embeddings().weight).any())\n",
  "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).max())\n",
  "print(torch.norm(modelB.get_input_embeddings().weight, dim=1).mean())"
 ] },
 { "cell_type": "code", "execution_count": 4, "id": "2b9893a3", "metadata": {}, "outputs": [], "source": [
  "def check_orthogonal(R):\n",
  "    # R should satisfy R.T @ R == I; report the Frobenius norm of the residual\n",
  "    I = torch.eye(R.size(0), device=R.device)\n",
  "    delta = torch.norm(R.T @ R - I)\n",
  "    print(f\"Delta: {delta:.6e}\")"
 ] },
 { "cell_type": "code", "execution_count": 5, "id": "e1a54c24", "metadata": {}, "outputs": [], "source": [
  "# use Procrustes analysis:\n",
  "def procrustes(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n",
  "    # centering is deliberately skipped; uncomment for classic Procrustes:\n",
  "    # A = A - A.mean(dim=0, keepdim=True)\n",
  "    # B = B - B.mean(dim=0, keepdim=True)\n",
  "\n",
  "    M = B.T @ A\n",
  "    # find the optimal rotation with SVD\n",
  "    U, _, Vt = torch.linalg.svd(M)\n",
  "\n",
  "    # rotation matrix that aligns B to A\n",
  "    R = U @ Vt\n",
  "\n",
  "    check_orthogonal(R)\n",
  "\n",
  "    return B @ R  # return the rotated matrix\n",
  "\n",
  "\n",
  "def get_rotated_matrix(A, B, n=1000):\n",
  "    # optionally fit the rotation on only the first n token rows:\n",
  "    # return procrustes(A[:n], B[:n])\n",
  "    return procrustes(A, B)"
 ] },
 { "cell_type": "code", "execution_count": 11, "id": "ff93495e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "ModelA mean norms: 1.3624546527862549\n",
  "ModelB mean norms: 11.50130844116211\n",
  "0.1184608394563315\n",
  "new_embedding mean norms: 11.50130844116211\n"
 ] } ], "source": [
  "emb1 = modelA.get_input_embeddings().weight\n",
  "emb2 = modelB.get_input_embeddings().weight\n",
  "\n",
  "# emb1_R = get_rotated_matrix(emb2, emb1)\n",
  "\n",
  "print(\"ModelA mean norms:\", torch.norm(emb1, dim=1).mean().item())\n",
  "print(\"ModelB mean norms:\", torch.norm(emb2, dim=1).mean().item())\n",
  "# print(\"Rotated modelA mean norms:\", torch.norm(emb1_R, dim=1).mean().item())\n",
  "\n",
  "scaling_factor = torch.norm(emb1, dim=1).mean().item() / torch.norm(emb2, dim=1).mean().item()\n",
  "print(scaling_factor)\n",
  "\n",
  "# new_embedding = torch.nn.Embedding.from_pretrained(emb1_R / scaling_factor)\n",
  "# rescale modelA's embeddings to modelB's norm scale:\n",
  "new_embedding = torch.nn.Embedding.from_pretrained(emb1 / scaling_factor)\n",
  "\n",
  "print(\"new_embedding mean norms:\", torch.norm(new_embedding.weight, dim=1).mean().item())\n",
  "\n",
  "modelA.set_input_embeddings(new_embedding)\n",
  "modelA.lm_head.weight = new_embedding.weight  # keep the output head tied to the new embeddings"
 ] },
 { "cell_type": "code", "execution_count": 107, "id": "9b671b41", "metadata": {}, "outputs": [], "source": [
  "# also swap the learned positional embeddings\n",
  "modelA.transformer.wpe.weight = modelB.transformer.wpe.weight"
 ] },
 { "cell_type": "code", "execution_count": 109, "id": "85957357", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "modelA.lm_head.out_features == tokenizerA.vocab_size"
 ] },
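 { "cell_type": "markdown", "id": "7aa11e03", "metadata": {}, "source": [
  "Added illustration (a minimal sketch on synthetic data, not part of the original run): `procrustes` should exactly recover a known rotation. If `B_toy` is rotated by a random orthogonal matrix `Q` to produce `A_toy`, aligning `B_toy` back to `A_toy` should give near-zero error. All sizes below are arbitrary."
 ] },
 { "cell_type": "code", "execution_count": null, "id": "7aa11e04", "metadata": {}, "outputs": [], "source": [
  "# added sketch: sanity-check the Procrustes solver on a known rotation\n",
  "torch.manual_seed(0)\n",
  "\n",
  "B_toy = torch.randn(50, 8)\n",
  "Q, _ = torch.linalg.qr(torch.randn(8, 8))  # random orthogonal matrix via QR\n",
  "A_toy = B_toy @ Q  # A_toy is an exactly rotated copy of B_toy\n",
  "\n",
  "aligned = procrustes(A_toy, B_toy)  # prints Delta via check_orthogonal\n",
  "print(\"recovery error:\", torch.norm(aligned - A_toy).item())"
 ] },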
"execute_result" } ], "source": [ "modelA.lm_head.out_features == tokenizerA.vocab_size" ] }, { "cell_type": "markdown", "id": "f6b39638", "metadata": {}, "source": [ "Text:" ] }, { "cell_type": "markdown", "id": "fbfa8d62", "metadata": {}, "source": [ "With it:\n" ] }, { "cell_type": "code", "execution_count": null, "id": "998a0ed6", "metadata": {}, "outputs": [], "source": [ "# modelA.lm_head.weight = modelA.get_input_embeddings().weight # should change nothing: they are the same object.\n" ] }, { "cell_type": "markdown", "id": "aa8b7ca4", "metadata": {}, "source": [ "Text:" ] }, { "cell_type": "code", "execution_count": 12, "id": "d8d9d612", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Device set to use cpu\n", "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[{'generated_text': 'Hello, how are you?\\n\\nYou are not a new.\\n\\nYou are a new.\\n\\n\\nYou are not a new.\\n\\n\\nYou are not a new.\\n\\n\\na new.\\n\\na new.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na.\\n\\na\\n\\na.\\n\\na\\n\\n.\\na\\n\\na\\n\\na\\n\\n.\\na\\n\\na\\n\\n.\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n.\\n\\na\\n\\n.\\n\\na\\n\\n.\\n\\n'}]\n" ] } ], "source": [ "# use model\n", "pipe = pipeline(\"text-generation\", model=modelA, tokenizer=tokenizerB)\n", "print(pipe(\"Hello, how are you?\"))" ] }, { "cell_type": "code", "execution_count": null, "id": "fc72ea8a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d7673a5e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 26, "id": "79616f5c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "modelA.lm_head.weight.data_ptr() == modelA.get_input_embeddings().weight.data_ptr()" ] }, { "cell_type": "code", "execution_count": 27, "id": "7fc76499", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "modelB.lm_head.weight.data_ptr() == modelB.get_input_embeddings().weight.data_ptr()" ] }, { "cell_type": "code", "execution_count": 10, "id": "7d51d201", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", " 11341, 389, 262, 976])\n" ] } ], "source": [ "tok = tokenizerA(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", "print(tok.input_ids[0])" ] }, { "cell_type": "code", "execution_count": 11, "id": "2e76534a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([ 1212, 318, 257, 1332, 290, 1312, 4240, 1521, 262, 11241,\n", " 11341, 389, 262, 976])\n" ] } ], "source": [ "tok = tokenizerB(\"This is a test and i wonder why the tokenizers are the same\", return_tensors = \"pt\")\n", "print(tok.input_ids[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "a44c465a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": 
"381c712f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "153995fe", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }