|
108 | 108 | "id": "UJJneXpTEg4W"
|
109 | 109 | },
|
110 | 110 | "source": [
|
| 111 | + " \n", |
111 | 112 | "# 1. Convert the GPT model implementation step by step"
|
112 | 113 | ]
|
113 | 114 | },
|
|
129 | 130 | "id": "979c7b6d-1370-4da1-8bfb-a2b27537bf2f"
|
130 | 131 | },
|
131 | 132 | "source": [
|
| 133 | + " \n", |
132 | 134 | "## 1.1 Replace LayerNorm with RMSNorm layer"
|
133 | 135 | ]
|
134 | 136 | },
|
|
228 | 230 | "id": "5eb81f83-c38c-46a4-b763-aa630a32e357"
|
229 | 231 | },
|
230 | 232 | "source": [
|
| 233 | + " \n", |
231 | 234 | "## 1.2 Replace GELU with SiLU activation"
|
232 | 235 | ]
|
233 | 236 | },
|
|
300 | 303 | "id": "4f9b5167-1da9-46c8-9964-8036b3b1deb9"
|
301 | 304 | },
|
302 | 305 | "source": [
|
| 306 | + " \n", |
303 | 307 | "## 1.3 Update the FeedForward module"
|
304 | 308 | ]
|
305 | 309 | },
|
|
388 | 392 | "id": "f6b7bf4f-99d0-42c1-807c-5074d2cc1949"
|
389 | 393 | },
|
390 | 394 | "source": [
|
| 395 | + " \n", |
391 | 396 | "## 1.4 Implement RoPE"
|
392 | 397 | ]
|
393 | 398 | },
|
|
503 | 508 | "id": "f78127b0-dda2-4c5a-98dd-bae8f5fe8297"
|
504 | 509 | },
|
505 | 510 | "source": [
|
| 511 | + " \n", |
506 | 512 | "## 1.5 Add RoPE to MultiHeadAttention module"
|
507 | 513 | ]
|
508 | 514 | },
|
|
652 | 658 | "id": "e5a1a272-a038-4b8f-aaaa-f4b241e7f23f"
|
653 | 659 | },
|
654 | 660 | "source": [
|
| 661 | + " \n", |
655 | 662 | "## 1.6 Update the TransformerBlock module"
|
656 | 663 | ]
|
657 | 664 | },
|
|
727 | 734 | "id": "ada953bc-e2c0-4432-a32d-3f7efa3f6e0f"
|
728 | 735 | },
|
729 | 736 | "source": [
|
| 737 | + " \n", |
730 | 738 | "## 1.7 Update the model class"
|
731 | 739 | ]
|
732 | 740 | },
|
|
791 | 799 | "id": "4bc94940-aaeb-45b9-9399-3a69b8043e60"
|
792 | 800 | },
|
793 | 801 | "source": [
|
| 802 | + " \n", |
794 | 803 | "## 2. Initialize model"
|
795 | 804 | ]
|
796 | 805 | },
|
|
1029 | 1038 | "id": "5dc64a06-27dc-46ec-9e6d-1700a8227d34"
|
1030 | 1039 | },
|
1031 | 1040 | "source": [
|
| 1041 | + " \n", |
1032 | 1042 | "## 3. Load tokenizer"
|
1033 | 1043 | ]
|
1034 | 1044 | },
|
|
1288 | 1298 | "id": "f63cc248-1d27-4eb6-aa50-173b436652f8"
|
1289 | 1299 | },
|
1290 | 1300 | "source": [
|
| 1301 | + " \n", |
1291 | 1302 | "## 4. Load pretrained weights"
|
1292 | 1303 | ]
|
1293 | 1304 | },
|
|
1544 | 1555 | "print(\"Output text:\\n\", token_ids_to_text(token_ids, tokenizer))"
|
1545 | 1556 | ]
|
1546 | 1557 | },
|
| 1558 | + { |
| 1559 | + "cell_type": "markdown", |
| 1560 | + "id": "d72ed949-b6c0-4966-922f-eb0da732c404", |
| 1561 | + "metadata": {}, |
| 1562 | + "source": [ |
| 1563 | + " \n", |
| 1564 | + "## 5. Using the instruction-finetuned model" |
| 1565 | + ] |
| 1566 | + }, |
1547 | 1567 | {
|
1548 | 1568 | "cell_type": "markdown",
|
1549 | 1569 | "id": "akyo7WNyF_YL",
|
1550 | 1570 | "metadata": {
|
1551 | 1571 | "id": "akyo7WNyF_YL"
|
1552 | 1572 | },
|
1553 | 1573 | "source": [
|
1554 |
| - "- Tip: as mentioned earlier, this is the pretrained base model; if you want to use a model capable of following instructions, use the `\"meta-llama/Llama-2-7b-chat\"` model instead" |
| 1574 | + "- As mentioned earlier, the model used above is the pretrained base model; if you want a model capable of following instructions, use the `\"meta-llama/Llama-2-7b-chat\"` model instead, as shown below" |
1555 | 1575 | ]
|
1556 | 1576 | },
|
1557 | 1577 | {
|
|
1630 | 1650 | "\n",
|
1631 | 1651 | "print(\"Output text:\\n\", token_ids_to_text(token_ids, tokenizer))"
|
1632 | 1652 | ]
|
| 1653 | + }, |
| 1654 | + { |
| 1655 | + "cell_type": "markdown", |
| 1656 | + "id": "0f693da1-a07c-4e1d-af5a-c3923525f1e2", |
| 1657 | + "metadata": {}, |
| 1658 | + "source": [ |
| 1659 | + " \n", |
| 1660 | + "# What's next?" |
| 1661 | + ] |
| 1662 | + }, |
| 1663 | + { |
| 1664 | + "cell_type": "markdown", |
| 1665 | + "id": "fae93739-ca12-46ba-8ca7-7c07c59f669b", |
| 1666 | + "metadata": {}, |
| 1667 | + "source": [ |
| 1668 | + "- This notebook converted the original GPT-2 architecture into a Llama 2 model\n", |
| 1669 | + "- If you are interested in how to convert Llama 2 into Llama 3, Llama 3.1, and Llama 3.2, check out the [converting-llama2-to-llama3.ipynb](converting-llama2-to-llama3.ipynb) notebook" |
| 1670 | + ] |
1633 | 1671 | }
|
1634 | 1672 | ],
|
1635 | 1673 | "metadata": {
|
|
1653 | 1691 | "name": "python",
|
1654 | 1692 | "nbconvert_exporter": "python",
|
1655 | 1693 | "pygments_lexer": "ipython3",
|
1656 |
| - "version": "3.10.6" |
| 1694 | + "version": "3.11.4" |
1657 | 1695 | },
|
1658 | 1696 | "widgets": {
|
1659 | 1697 | "application/vnd.jupyter.widget-state+json": {
|
|
0 commit comments