diff --git a/LICENSE-MODEL.md b/LICENSE-MODEL.md new file mode 100644 index 0000000..eb47f76 --- /dev/null +++ b/LICENSE-MODEL.md @@ -0,0 +1,120 @@ +# Creative Commons Attribution-NonCommercial 4.0 International Public License + +By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. + +## Section 1 – Definitions. + +a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. + +b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. + +c. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. + +d. 
Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. + +e. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. +f. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. + +g. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. + +h. Licensor means the individual(s) or entity(ies) granting rights under this Public License. + +i. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. + +j. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. + +k. 
Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. + +l. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. + +## Section 2 – Scope. + +a. License grant. + 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: + A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and + B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. + 3. Term. The term of this Public License is specified in Section 6(a). + 4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. + 5. Downstream recipients. + a. Offer from the Licensor – Licensed Material. 
Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. + b. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. + 6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). + +b. Other rights. + +1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. + +2. Patent and trademark rights are not licensed under this Public License. + +3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. + +## Section 3 – License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the following conditions. + +a. Attribution. + +1. If You Share the Licensed Material (including in modified form), You must: + + A. 
retain the following if it is supplied by the Licensor with the Licensed Material: + i) identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); + ii) a copyright notice; + iii) a notice that refers to this Public License; + iv) a notice that refers to the disclaimer of warranties; + v) a URI or hyperlink to the Licensed Material to the extent reasonably practicable; + B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and + C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. +2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. +3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. +4. If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License. + +## Section 4 – Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; + b. 
if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and + c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. + +## Section 5 – Disclaimer of Warranties and Limitation of Liability. + + a. Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. + + b. To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. + + c. 
The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. + +## Section 6 – Term and Termination. + +a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. + +b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or + 2. upon express reinstatement by the Licensor. + +For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. + +c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. + +d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License. + +## Section 7 – Other Terms and Conditions. + +a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. + +b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. + +## Section 8 – Interpretation. + +a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. + +b. 
To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. + +c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. + +d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..f9bd145 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include requirements.txt diff --git a/README.md b/README.md index b936afd..8135220 100644 --- a/README.md +++ b/README.md @@ -6,23 +6,29 @@

- GitHub + GitHub - + GitHub release

-**GALACTICA** is a general-purpose scientific language model. It is trained on a large corpus of scientific text and data. It can perform scientific NLP tasks at a high level, as well as tasks such as citation prediction, mathematical reasoning, molecular property prediction and protein annotation. A demo is available at [galactica.org](https://galactica.org). +**GALACTICA** is a general-purpose scientific language model. It is trained on a large corpus of scientific text and data. It can perform scientific NLP tasks at a high level, as well as tasks such as citation prediction, mathematical reasoning, molecular property prediction and protein annotation. More information is available at [galactica.org](https://galactica.org). ## Install -**With `pip`** +From pip: ```bash pip install galai ``` +From repository: + +```bash +pip install git+https://github.com/paperswithcode/galai +``` + ## Models There are five GALACTICA models available which we detail below: @@ -45,12 +51,49 @@ model.generate("Scaled dot product attention:\n\n\\[") # Scaled dot product attention:\n\n\\[ \\displaystyle\\text{Attention}(Q,K,V)=\\text{softmax}(\\frac{QK^{T}}{\\sqrt{d_{k}}}%\n)V \\] ``` +Read the full introduction to Galactica models as a [PDF](https://github.com/paperswithcode/galai/blob/main/notebooks/Introduction%20to%20Galactica%20Models.pdf) or a [jupyter notebook](https://github.com/paperswithcode/galai/blob/main/notebooks/Introduction%20to%20Galactica%20Models.ipynb). + +You can also find all the model weights with their model cards and inference widget in the [Hugging Face Hub](https://huggingface.co/models?other=galactica). All the models can be used out of the box with the `transformers` library. 
+ +```bash +pip install transformers accelerate +``` + +You can run inference using the high-level `pipeline` API + +```python +from transformers import pipeline + +model = pipeline("text-generation", model="facebook/galactica-6.7b") +input_text = "The Transformer architecture [START_REF]" +model(input_text) +``` + +Or for more control you can use the lower level `OPTForCausalLM` class. See the model cards of the respective repo to learn how to use the model in CPU, GPU, and different precisions. + +```python +from transformers import AutoTokenizer, OPTForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-6.7b") +model = OPTForCausalLM.from_pretrained("facebook/galactica-6.7b", device_map="auto") + +input_text = "The Transformer architecture [START_REF]" +input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda") + +outputs = model.generate(input_ids) +print(tokenizer.decode(outputs[0])) +``` + ## Capabilities +GALACTICA is a stand-alone LM which is not instruction tuned. Because of this you need to use the correct prompts to get good results. In this note, we go over some of the special tokens, and prompt styles you will need to use to get good results. + We demonstrate some examples using the standard (6.7B) model below. 📚 **Predict Citations**: +You need to use `[START_REF]`: + ```python model.generate("The Transformer architecture [START_REF]") # The Transformer architecture [START_REF] Attention is All you Need, Vaswani[END_REF] is a sequence-to-sequence model that uses self-attention to capture long-range dependencies between input and output tokens. The Transformer has been shown to achieve state-of-the-art results on a wide range of natural @@ -65,30 +108,101 @@ model.generate("The Schwarzschild radius is defined as: \\[") 🤔 **Reasoning**: +Reasoning uses the special `<work>` token: + ```python model.generate("A force of 0.6N is applied to an object, which accelerates at 3m/s. What is its mass? <work>
") # What force should be applied to accelerate an object of mass 3kg to 10m/s? \nWe can use Newton's second law: F = ma. We can substitute variables to get:\n\n\\[ F = \\left(66kg ``` -📄 **Generate Documents**: +⚛️ **Generate Molecules**: + +```python +model.generate("[START_I_SMILES]", max_length=200) +# [START_I_SMILES]CCC1=CC=C(C=C1)C(=O)NC2=CC=CC(=C2)C(=O)NC3=CC=C(C=C3)S(=O)(=O)N[END_I_SMILES]\n\n### Molecular Formula\n\nC22H21N3O4S\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for 3-[[3-(4-ethylphenyl)-3-oxo-propanoyl]amino]-N-(4-sulfamoylphenyl)benzamide.\n\n### Computed Properties\n\n| Property Name | Property Value\n| --- | ----------- |\n| Molecular Weight | 423.5\n| XLogP3-AA Log P | 3.2\n| Hydrogen Bond Donor Count | 3\n| Hydrogen Bond Acceptor Count +``` + +🧑‍🔬 **Predict Protein Annotations**: + +```python +model.generate("[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords", max_length=200) +# '[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords\n\nCytoplasm, Methyltransferase, rRNA processing, S-adenosyl-L-methionine, Transferase\n\n## References\n\nQuestion: What are some articles for Ribosomal RNA small subunit methyltransferase H?\n\nAnswer: \n\n[START_REF] Comparative Genomics of 28 Salmonella enterica Isolates: Evidence for CRISPR-Mediated Adaptive Sublineage Evolution, Fricke[END_REF]\n\n' +``` + +🖱️ **Free-Form Generation** + +If you want autocomplete based functionality, it is often good to experiment with turning off `new_doc=True`. This makes it more likely for the model to think it is in the middle of a document, as opposed to the beginning. 
+ +```python +model.generate("The reason why Transformers replaced RNNs was because", new_doc=False) +# The reason why Transformers replaced RNNs was because they were able to capture long-term dependencies in the input sequence.\n\n# 2.2.2. Attention Mechanism\n\nThe attention mechanism was introduced in [START_REF] Neural Machine Translation by Jointly Learning to Align and Translate, Bahdan +``` + +❓ **Question Answering** + +In the paper we prefix questions with "Q:" or "Question:". A typical format is "Question: question.\n\nAnswer:", for example: + +```python +model.generate("Question: What is the notch signaling pathway?\n\nAnswer:") +# 'Question: What is the notch signaling pathway?\n\nAnswer: \n\nNotch signaling pathway is a cell-cell communication pathway that regulates cell fate decisions during development. It is involved in cell proliferation, differentiation, apoptosis, and cell migration. The Notch signaling pathway is activated by the binding of' +``` + +📄 **Documents** + +When starting a document, you must use the start document token for good results. To do this, set `new_doc=True` in generate: + +For some article types, like Wikipedia style articles, lecture notes and GitHub repositories, use `#` to begin, e.g: + +```python +model.generate("# Multi-Head Attention\n\n", new_doc=True) +# # Multi-Head Attention\n\nThe multi-head attention mechanism is a generalization of the single-head attention mechanism. The multi-head attention mechanism is a combination of multiple single-head attention mechanisms. 
The multi-head attention mechanism is shown in Figure 2.\n\nThe multi- +``` + +For paper documents, use Title, e.g: + +```python +model.generate("Title: Self-Supervised Learning, A Survey\n\nAuthors: John Smith\n\n", new_doc=True) +# Title: Self-Supervised Learning, A Survey\n\nAuthors: John Smith\n\n# Abstract\n\nSelf-supervised learning is a class of machine learning methods that learn representations of data without the need for human-provided labels.\nIn this survey, we provide a comprehensive overview of the field +``` + +You can also try alternative sampling techniques for less repetitions, e.g. ```python model.generate("Lecture 1: The Ising Model\n\n", new_doc=True, top_p=0.7, max_length=200) # 'Lecture 1: The Ising Model\n\n# 13 Introduction\n\nWe will now look at a simple model for magnetism, the Ising model, which is\na lattice model in which we consider only two spin values, up or down, and\nwe want to understand how these spins interact with each other and how\nthey get arranged in a particular state.\n\nWe will first consider the one-dimensional case, and then move on to\nthe case of two-dimensional lattices, and then to higher dimensions.\n\n# 14 The One-Dimensional Ising Model\n\n# 14.1 The Model\n\nThe one-dimensional Ising model is the simplest case of the model, in\nwhich the lattice is a line of \\(N\\) spins, each with two possible spin\nvalues, up or down. 
In other words, we consider a line of \\(N\\) spins\nwhere each spin can point up or down' ``` -⚛️ **Generate Molecules**: +📜 **Summarization** + +You can add "TLDR:" for TLDR summaries: ```python -model.generate("[START_I_SMILES]", top_p=0.6, max_length=200) -# [START_I_SMILES]CCC1=CC=C(C=C1)C(=O)NC2=CC=CC(=C2)C(=O)NC3=CC=C(C=C3)S(=O)(=O)N[END_I_SMILES]\n\n### Molecular Formula\n\nC22H21N3O4S\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for 3-[[3-(4-ethylphenyl)-3-oxo-propanoyl]amino]-N-(4-sulfamoylphenyl)benzamide.\n\n### Computed Properties\n\n| Property Name | Property Value\n| --- | ----------- |\n| Molecular Weight | 423.5\n| XLogP3-AA Log P | 3.2\n| Hydrogen Bond Donor Count | 3\n| Hydrogen Bond Acceptor Count +TEXT = """Information overload is a major obstacle to scientific progress. The explosive growth in scientific literature and data has made it ever harder to discover useful insights in a large mass of information. Today scientific knowledge is accessed through search engines, but they are unable to organize scientific knowledge alone. In this paper we introduce Galactica: a large language model that can store, combine and reason about scientific knowledge. We train on a large scientific corpus of papers, reference material, knowledge bases and many other sources. We outperform existing models on a range of scientific tasks. On technical knowledge probes such as LaTeX equations, Galactica outperforms the latest GPT-3 by 68.2% versus 49.0%. Galactica also performs well on reasoning, outperforming Chinchilla on mathematical MMLU by 41.3% to 35.7%, and PaLM 540B on MATH with a score of 20.4% versus 8.8%. It also sets a new state-of-the-art on downstream tasks such as PubMedQA and MedMCQA dev of 77.6% and 52.9%. And despite not being trained on a general corpus, Galactica outperforms BLOOM and OPT-175B on BIG-bench. We believe these results demonstrate the potential for language models as a new interface for science. 
We open source the model for the benefit of the scientific community.""" + +model.generate(TEXT + "\n\nTLDR:", max_length=400) +# ...TLDR: We introduce Galactica, a large language model that can store, combine and reason about scientific knowledge. ``` -🧑‍🔬 **Predict Protein Annotations**: +💎 **Entity extraction** + +You can extract entities from documents. We use the abstract example (`TEXT`) from the previous section, and add questions ```python -model.generate("[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords", max_length=200) -# '[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords\n\nCytoplasm, Methyltransferase, rRNA processing, S-adenosyl-L-methionine, Transferase\n\n## References\n\nQuestion: What are some articles for Ribosomal RNA small subunit methyltransferase H?\n\nAnswer: \n\n[START_REF] Comparative Genomics of 28 Salmonella enterica Isolates: Evidence for CRISPR-Mediated Adaptive Sublineage Evolution, Fricke[END_REF]\n\n' +ENT_TEXT = TEXT + '\n\nWhat scientific entities are mentioned in the abstract above?\n\n' + +model.generate(ENT_TEXT, max_length=400) +# ...What scientific entities are mentioned in the abstract above?\n\nA: LaTeX equations, mathematical MMLU, MATH, PubMedQA, MedMCQA, BIG-bench +``` + +👨‍🔬 **IUPAC Name prediction** + +For this task, we used a prompt based off the PubChem document and prompted for the completion. 
We use the 6.7B model below: + +```python +context = "[START_I_SMILES]C(C(=O)O)N[END_I_SMILES]\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for" +model.generate(context, max_length=400) +# [START_I_SMILES]C(C(=O)O)N[END_I_SMILES]\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for 2-amino-2-oxo-acetic acid +# Note this is an incorrect prediction ``` ## Citation diff --git a/docs/PROMPTBOOK.md b/docs/PROMPTBOOK.md new file mode 100644 index 0000000..52434dd --- /dev/null +++ b/docs/PROMPTBOOK.md @@ -0,0 +1,80 @@ +# PromptBOOK + +**GALACTICA** is a stand-alone LM which is not instruction tuned. Because of this you need to use the correct prompts to get good results. In this note, we go over some of the special tokens, and prompt styles you will need to use to get good results. + +## Special Tokens + +### Citations + +To cite, you need to use `[START_REF]`. + +```python +model.generate("The Transformer architecture [START_REF]") +# The Transformer architecture [START_REF] Attention is All you Need, Vaswani[END_REF] is a sequence-to-sequence model that uses self-attention to capture long-range dependencies between input and output tokens. The Transformer has been shown to achieve state-of-the-art results on a wide range of natural +``` + +### Reasoning + +To try step-by-step reasoning, use `<work>`: + +```python +model.generate("A force of 0.6N is applied to an object, which accelerates at 3m/s. What is its mass? <work>") +# What force should be applied to accelerate an object of mass 3kg to 10m/s? <work>\nWe can use Newton's second law: F = ma. 
We can substitute variables to get:\n\n\\[ F = \\left(66kg +``` + +### SMILES + +For standard SMILES use `[START_SMILES]` + +```python +model.generate("[START_SMILES]", top_p=0.6, max_length=200) +``` + +For Isomeric SMILES use `[START_I_SMILES]`: + +```python +model.generate("[START_I_SMILES]", top_p=0.6, max_length=200) +# [START_I_SMILES]CCC1=CC=C(C=C1)C(=O)NC2=CC=CC(=C2)C(=O)NC3=CC=C(C=C3)S(=O)(=O)N[END_I_SMILES]\n\n### Molecular Formula\n\nC22H21N3O4S\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for 3-[[3-(4-ethylphenyl)-3-oxo-propanoyl]amino]-N-(4-sulfamoylphenyl)benzamide.\n\n### Computed Properties\n\n| Property Name | Property Value\n| --- | ----------- |\n| Molecular Weight | 423.5\n| XLogP3-AA Log P | 3.2\n| Hydrogen Bond Donor Count | 3\n| Hydrogen Bond Acceptor Count +``` + +### Protein Sequences + +For protein sequences, use `[START_AMINO]`: + +```python +model.generate("[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords", max_length=200) +# '[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords\n\nCytoplasm, Methyltransferase, rRNA processing, S-adenosyl-L-methionine, Transferase\n\n## References\n\nQuestion: What are some articles for Ribosomal RNA small subunit methyltransferase H?\n\nAnswer: \n\n[START_REF] Comparative Genomics of 28 Salmonella enterica Isolates: Evidence for CRISPR-Mediated Adaptive Sublineage Evolution, Fricke[END_REF]\n\n' +``` + +## Documents + +When starting a document, you must use the start document token for good results. 
To do this, set `new_doc=True` in generate: + +For some article types, like Wikipedia style articles and GitHub repositories, use `#` to begin, e.g: + +```python +model.generate("# Multi-Head Attention", new_doc=True) +``` + +For paper documents, use Title, e.g: + +```python +model.generate("Title: Self-Supervised Learning, A Survey", new_doc=True) +``` + +## Free-Form Generation + +If you want autocomplete based functionality, it is often good to experiment with turning off `new_doc=True`. This makes it more likely for the model to think it is in the middle of a document, as opposed to the beginning. + +```python +model.generate("The reason why Transformers replaced RNNs was because", new_doc=False) +``` + +## Questions + +In the paper we prefix questions with "Q:" or "Question:". A typical format is "Question: question.\n\nAnswer:", for example: + +```python +model.generate("Question: What is the notch signaling pathway?\n\nAnswer:") +``` + diff --git a/docs/model_card.md b/docs/model_card.md index 62c6253..dd4ffd8 100644 --- a/docs/model_card.md +++ b/docs/model_card.md @@ -35,7 +35,7 @@ The models are made available under a non-commercial CC BY-NC 4.0 license. More ## Training Data -The GALACTICA models are trained on 106 billion tokens of open-access scientific text and data. This includes papers, textbooks, scientific websites, encyclopedias, reference material, knowledge bases, and more. We tokenize different modalities to provide a natural langauge interface for different tasks. See the README.md for more information. See the paper for full information on the training data. +The GALACTICA models are trained on 106 billion tokens of open-access scientific text and data. This includes papers, textbooks, scientific websites, encyclopedias, reference material, knowledge bases, and more. We tokenize different modalities to provide a natural language interface for different tasks. See the README.md for more information. 
See the paper for full information on the training data. ## Performance and Limitations diff --git a/galai/__init__.py b/galai/__init__.py index 138ece2..ba535a6 100644 --- a/galai/__init__.py +++ b/galai/__init__.py @@ -1,43 +1,133 @@ +from typing import Union + from galai.model import Model -from galai.utils import get_checkpoint_path, get_tokenizer_path +from galai.utils import ModelInfo +import torch +import warnings +from pathlib import Path + +HF_MAPPING = { + "mini": ("facebook/galactica-125m", torch.float32), + "base": ("facebook/galactica-1.3b", torch.float32), + "standard": ("facebook/galactica-6.7b", torch.float32), + "large": ("facebook/galactica-30b", torch.float32), + "huge": ("facebook/galactica-120b", torch.float16) +} -def load_model(name: str, dtype: str=None, num_gpus: int=None): +def load_model( + name: str, + dtype: Union[str, torch.dtype] = None, + num_gpus: int = None, + parallelize: bool = False +): """ Utility function for loading the model Parameters ---------- - name : str + name: str Name of the model dtype: str - Optional dtype; default float32 for smaller models + Optional dtype; default float32 for all models but 'huge' + + num_gpus : int (optional) + Number of GPUs to use for the inference. If None, all available GPUs are used. If 0 (or if + None and there are no GPUs) only a CPU is used. If a positive number n, then the first n CUDA + devices are used. - num_gpus: int - Number of GPUs to use, default 8 GPUs + parallelize : bool; default False + Specify if to use model tensor parallelizm. Ignored in CPU or single GPU inference. + + By the default (when parallelize is False) the multi-GPU inference is run using accelerate's + pipeline parallelizm in which each GPU is responsible for evaluating a given subset of + model's layers. In this mode evaluations are run sequentially. 
This mode is well suited for + developing in model's internals as it is more robust in terms of recovering from exceptions + due to not using additional processes. However, because of the sequential nature of + pipeline parallelism, at any given time only a single GPU is working. + + If parallelize is True, parallelformers' model tensor parallelism is used instead. Returns ---------- Model - model object """ - if name not in ['mini', 'base', 'standard', 'large', 'huge']: - raise ValueError("Invalid model name. Must be one of 'mini', 'base', 'standard', 'large', 'huge'.") + + if name in HF_MAPPING: + hf_model, default_dtype = HF_MAPPING[name] + galai_model = True + elif Path(name).exists(): + hf_model = name + default_dtype = torch.float32 + galai_model = False + else: + raise ValueError( + "Invalid model name. Must be one of 'mini', 'base', 'standard', 'large', 'huge', " + + "a path to a local checkpoint dir, or a model name available on HuggingFace hub." + ) if dtype is None: - if name == 'huge': - dtype = 'float16' - else: - dtype = 'float32' + dtype = default_dtype + + if isinstance(dtype, str): + dtype = getattr(torch, dtype, None) + if dtype not in (torch.float16, torch.float32, torch.bfloat16): + raise ValueError( + f"Unsupported dtype: {dtype}" + ) + + if dtype == torch.bfloat16 and parallelize: + raise ValueError( + "Model tensor parallel does not support bfloat16 dtype. Use either dtype='float16' " + + "or dtype='float32', or disable tenros parallelizm with parallelize=False." + ) if num_gpus is None: - num_gpus = 8 + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + else: + num_gpus = 0 + elif num_gpus > 0: + # make sure CUDA is available + if not torch.cuda.is_available(): + warnings.warn( + "No CUDA support detected, falling back to CPU inference. If you want to run " + + "inference on GPU make sure CUDA is configured correctly and pytorch is " + + "installed with CUDA support.
Set num_gpus=None to avoid this warning.", + UserWarning + ) + num_gpus = 0 + elif num_gpus > torch.cuda.device_count(): + available = torch.cuda.device_count() + warnings.warn( + f"num_gpus={num_gpus} is higher than the number of available CUDA devices. " + + f"Setting it to {available}.", + UserWarning + ) + num_gpus = available + if num_gpus > 1 and parallelize and galai_model: + mi = ModelInfo.by_name(name) + if mi.num_heads % num_gpus != 0: + raise ValueError( + f"With parallelize=True the number of model heads ({mi.num_heads} for '{name}' " + + "model) must be divisible by the num_gpus. Adapt the number of GPUs, try a " + + "different model or set parallelize=False" + ) + if num_gpus <= 1 and parallelize: + warnings.warn( + "parallelize=True requires at least two GPUs. Setting it back to False.", + UserWarning + ) + parallelize = False - model = Model(name=name, dtype=dtype, num_gpus=num_gpus) - model._set_tokenizer(tokenizer_path=get_tokenizer_path()) - if name in ['mini', 'base']: - model._load_checkpoint(checkpoint_path=get_checkpoint_path(name)) - else: - model._load_checkpoint(checkpoint_path=get_checkpoint_path(name)) + model = Model( + name=name, + dtype=dtype, + num_gpus=num_gpus, + tensor_parallel=parallelize, + ) + model._set_tokenizer(hf_model) + model._load_checkpoint(checkpoint_path=hf_model) return model diff --git a/galai/architecture.py b/galai/architecture.py deleted file mode 100644 index be4d686..0000000 --- a/galai/architecture.py +++ /dev/null @@ -1,1128 +0,0 @@ -""" PyTorch OPT model. 
-- GALACTICA Adaptation""" - -import random -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import Tensor, nn -from torch.nn import CrossEntropyLoss - -from galai.config import OPTConfig -from transformers.models.opt.modeling_opt import ( - ACT2FN, - BaseModelOutputWithPast, - CausalLMOutputWithPast, - PreTrainedModel, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) - - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "" -_CONFIG_FOR_DOC = "OPTConfig" -_TOKENIZER_FOR_DOC = "GPT2Tokenizer" - -# Base model docstring -_EXPECTED_OUTPUT_SHAPE = [1, 8, 768] - - -OPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/opt-125m", - "facebook/opt-350m", - "facebook/opt-1.3b", - "facebook/opt-2.7b", - "facebook/opt-6.7b", - "facebook/opt-13b", - "facebook/opt-30b", - # See all OPT models at https://huggingface.co/models?filter=opt -] - - -def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), float("-inf")) - mask_cond = torch.arange(mask.size(-1)) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
- """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) - - -def make_positions(mask, padding_idx: int): - """Replace non-padding symbols with their position numbers. - - Position numbers begin at padding_idx+1. Padding symbols are ignored. - """ - # The series of casts and type-conversions here are carefully - # balanced to both work with ONNX export and XLA. In particular XLA - # prefers ints, cumsum defaults to output longs, and ONNX doesn't know - # how to handle the dtype kwarg in cumsum. - positions = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx - return positions - - - -class OPTLearnedPositionalEmbedding(nn.Embedding): - """ - This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting - based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to - the forward function. - """ - - def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int = 1): - super().__init__(num_embeddings, embedding_dim, padding_idx) - self.onnx_trace = False - if self.padding_idx is not None: - self.max_positions = self.num_embeddings - self.padding_idx - 1 - else: - self.max_positions = self.num_embeddings - - def forward(self, attention_mask: Tensor, positions: Optional[Tensor] = None): - # attention_masks is expected to be of size [batch_size x seq_len]. 
- if not ((positions is None) or (self.padding_idx is None)): - raise ValueError("If positions is pre-computed then padding_idx should not be set.") - - if positions is None: - attention_mask = attention_mask.long() - positions = make_positions(attention_mask, self.padding_idx) - - return F.embedding( - positions, - self.weight, - self.padding_idx, - self.max_norm, - self.norm_type, - self.scale_grad_by_freq, - self.sparse, - ) - - -# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->OPT -class OPTAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - is_decoder: bool = False, - bias: bool = True, - ): - super().__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - - if (self.head_dim * num_heads) != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" - f" and `num_heads`: {num_heads})." 
- ) - self.scaling = self.head_dim**-0.5 - self.is_decoder = is_decoder - - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = 
self._shape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if layer_head_mask is not None: - if layer_head_mask.size() != (self.num_heads,): - raise ValueError( - f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" - ) - attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, 
tgt_len, src_len) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned aross GPUs when using tensor-parallelism. 
- attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -class OPTDecoderLayer(nn.Module): - def __init__(self, config: OPTConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = OPTAttention( - embed_dim=self.embed_dim, - num_heads=config.num_attention_heads, - dropout=config.attention_dropout, - is_decoder=True, - bias=config.bias, # force Marcin - ) - self.do_layer_norm_before = config.do_layer_norm_before - self.dropout = config.dropout - - # force thomas - config.activation_function = "gelu" - self.activation_fn = ACT2FN[config.activation_function] - - self.activation_dropout = config.activation_dropout - - self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine) # force Marcin - self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.bias) # force Marcin - self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.bias) # force Marcin - self.final_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine) # force Marcin - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. 
- layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size - `(encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention - if self.do_layer_norm_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - # 350m applies layer norm AFTER attention - if not self.do_layer_norm_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Fully Connected - hidden_states_shape = hidden_states.shape - hidden_states = hidden_states.reshape(-1, hidden_states.size(-1)) - residual = hidden_states - - # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention - if self.do_layer_norm_before: - hidden_states = self.final_layer_norm(hidden_states) - - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - hidden_states = (residual + hidden_states).view(hidden_states_shape) - - # 350m applies layer norm AFTER attention - if not 
self.do_layer_norm_before: - hidden_states = self.final_layer_norm(hidden_states) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -OPT_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`OPTConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare OPT Model outputting raw hidden-states without any specific head on top.", - OPT_START_DOCSTRING, -) -class OPTPreTrainedModel(PreTrainedModel): - config_class = OPTConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] - - def _init_weights(self, module): - std = self.config.init_std - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (OPTDecoder)): - module.gradient_checkpointing = value - - -OPT_GENERATION_EXAMPLE = r""" - Generation example: - - ```python - >>> from transformers import AutoTokenizer, AutoModelForCausalLM - - >>> model = OPTForCausalLM.from_pretrained("ArthurZ/opt-350m") - >>> tokenizer = GPT2Tokenizer.from_pretrained("patrickvonplaten/opt_gpt2_tokenizer") - - >>> TEXTS_TO_GENERATE = "Hey, are you consciours? Can you talk to me?" "Hi there, my name is Barack" - >>> inputs = tokenizer([TEXTS_TO_GENERATE], max_length=1024, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - 'I'm not conscious.<\s>' - ``` -""" - -OPT_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`GPT2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. 
- - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_inputs`] and modify - to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the - default strategy. - head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. 
- - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -class OPTDecoder(OPTPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`OPTDecoderLayer`] - - Args: - config: OPTConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__(self, config: OPTConfig): - super().__init__(config) - self.dropout = config.dropout - self.layerdrop = config.layerdrop - self.padding_idx = config.pad_token_id - self.max_target_positions = config.max_position_embeddings - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx) - if config.scale_embeddings: - self.embed_scale = config.hidden_size**0.5 # force Thomas - else: - self.embed_scale = 1.0 - - # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 - if self.padding_idx is not None: - num_embeddings = config.max_position_embeddings + 2 - - # force thomas - if config.learned_embeddings: - self.embed_positions = OPTLearnedPositionalEmbedding(num_embeddings, config.hidden_size, self.padding_idx) - else: - self.embed_positions = SinusoidalPositionalEmbedding( - config.hidden_size, #embedding_dim, - self.padding_idx, - init_size=num_embeddings + self.padding_idx + 1, - ) - - - if config.word_embed_proj_dim != config.hidden_size: - self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False) - else: - self.project_out = None - - if config.word_embed_proj_dim != config.hidden_size: - self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False) - else: - self.project_in = None - - self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine) # force thomas - self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from 
transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length - ).to(self.device) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - return combined_attention_mask - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. 
- - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those - that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of - all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - if self.embed_scale != 1.0: - inputs_embeds = self.embed_scale * inputs_embeds - - # embed positions - if attention_mask is None: - attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device) - - positions = self.embed_positions(attention_mask)[:, past_key_values_length:, :] - - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - - if self.project_in is not None: - inputs_embeds = self.project_in(inputs_embeds) - - hidden_states = inputs_embeds + positions - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - # check if head_mask has a correct number of layers 
specified if desired - for attn_mask, mask_name in zip([head_mask], ["head_mask"]): - if attn_mask is not None: - if attn_mask.size()[0] != (len(self.layers)): - raise ValueError( - f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." - ) - - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - head_mask[idx] if head_mask is not None else None, - None, - ) - else: - - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - layer_head_mask=(head_mask[idx] if head_mask is not None else None), - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - # force thomas - if self.layer_norm is not None: - hidden_states = self.layer_norm(hidden_states) - - if self.project_out is not None: - hidden_states = self.project_out(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - 
all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -@add_start_docstrings( - "The bare OPT Model outputting raw hidden-states without any specific head on top.", - OPT_START_DOCSTRING, -) -class OPTModel(OPTPreTrainedModel): - def __init__(self, config: OPTConfig): - super().__init__(config) - self.decoder = OPTDecoder(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.decoder.embed_tokens - - def set_input_embeddings(self, value): - self.decoder.embed_tokens = value - - def get_decoder(self): - return self.decoder - - @add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPast, - config_class=_CONFIG_FOR_DOC, - expected_output=_EXPECTED_OUTPUT_SHAPE, - ) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None 
else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if not return_dict: - return decoder_outputs - - return BaseModelOutputWithPast( - last_hidden_state=decoder_outputs.last_hidden_state, - past_key_values=decoder_outputs.past_key_values, - hidden_states=decoder_outputs.hidden_states, - attentions=decoder_outputs.attentions, - ) - - -class OPTForCausalLM(OPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head\.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = OPTModel(config) - - # the lm_head weight is automatically tied to the embed tokens weight - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.decoder.embed_tokens - - def set_input_embeddings(self, value): - self.model.decoder.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model.decoder = decoder - - def get_decoder(self): - return self.model.decoder - - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: 
Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of - shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional - tensors are only required when the model is used as a decoder in a Sequence to Sequence model. 
- - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those - that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of - all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- - Returns: - - Example: - - ```python - >>> from transformers import OPTTokenizer, OPTForCausalLM - # this needs fixing - - >>> tokenizer = OPTTokenizer.from_pretrained("patrickvonplaten/opt_gpt2_tokenizer") - >>> model = OPTForCausalLM.from_pretrained("ArthurZ/opt-350m") - >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> logits = outputs.logits - >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size] - >>> list(logits.shape) == expected_shape - True - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model.decoder( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - logits = self.lm_head(outputs[0]).contiguous() - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - - loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs): - # if model 
is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past: - input_ids = input_ids[:, -1:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past, - "use_cache": use_cache, - } - - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),) - return reordered_past - - -# other activation - -def gelu(x: torch.Tensor) -> torch.Tensor: - return torch.nn.functional.gelu(x.float()).type_as(x) - -#import for SinPos -import math -from typing import Any, Optional - -import torch -import torch.onnx.operators -from torch import Tensor, nn - -# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -class SinusoidalPositionalEmbedding(nn.Module): - """This module produces sinusoidal positional embeddings of any length. - - Padding symbols are ignored. - """ - - def __init__(self, embedding_dim, padding_idx, init_size=1024): - super().__init__() - self.embedding_dim = embedding_dim - self.padding_idx = padding_idx if padding_idx is not None else 0 - self.weights = SinusoidalPositionalEmbedding.get_embedding( - init_size, embedding_dim, padding_idx - ) - self.onnx_trace = False - self.register_buffer("_float_tensor", torch.FloatTensor(1)) - self.max_positions = int(1e5) - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - - @staticmethod - def get_embedding( - num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None - ): - """Build sinusoidal embeddings. 
- - This matches the implementation in tensor2tensor, but differs slightly - from the description in Section 3.5 of "Attention Is All You Need". - """ - half_dim = embedding_dim // 2 - emb = math.log(10000) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) - emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze( - 1 - ) * emb.unsqueeze(0) - emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view( - num_embeddings, -1 - ) - if embedding_dim % 2 == 1: - # zero pad - emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) - if padding_idx is not None: - emb[padding_idx, :] = 0 - return emb - - def forward( - self, - input, - incremental_state: Optional[Any] = None, - timestep: Optional[Tensor] = None, - positions: Optional[Any] = None, - ): - """Input is expected to be of size [bsz x seqlen].""" - bspair = torch.onnx.operators.shape_as_tensor(input) - bsz, seq_len = bspair[0], bspair[1] - max_pos = self.padding_idx + 1 + seq_len - if self.weights is None or max_pos > self.weights.size(0): - # recompute/expand embeddings if needed - self.weights = SinusoidalPositionalEmbedding.get_embedding( - max_pos, self.embedding_dim, self.padding_idx - ) - self.weights = self.weights.to(self._float_tensor) - - if incremental_state is not None: - # positions is the same for every token when decoding a single step - pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len - if self.onnx_trace: - return ( - self.weights.index_select(index=self.padding_idx + pos, dim=0) - .unsqueeze(1) - .repeat(bsz, 1, 1) - ) - return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1) - - positions = make_positions( - input, self.padding_idx #, onnx_trace=self.onnx_trace - ) - if self.onnx_trace: - flat_embeddings = self.weights.detach().index_select(0, positions.view(-1)) - embedding_shape = torch.cat( - (bsz.view(1), seq_len.view(1), torch.tensor([-1], dtype=torch.long)) - ) - embeddings = 
torch.onnx.operators.reshape_from_tensor_shape( - flat_embeddings, embedding_shape - ) - return embeddings - return ( - self.weights.index_select(0, positions.view(-1)) - .view(bsz, seq_len, -1) - .detach() - ) \ No newline at end of file diff --git a/galai/config.py b/galai/config.py deleted file mode 100644 index e7b8a8c..0000000 --- a/galai/config.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding=utf-8 - -# https://github.com/huggingface/transformers/blob/7999ec125fc31428ed6879bf01bb013483daf704/src/transformers/models/opt/configuration_opt.py -# with additional parameters "learned_embeddings" and "scale_embeddings". - -# Copyright 2022 The Metaseq Authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" OPT model configuration""" -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -class OPTConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`OPTModel`]. It is used to instantiate a OPT model - according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the OPT - [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) architecture. - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the - documentation from [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 50272): - Vocabulary size of the OPT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`OPTModel`] - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of decoder layers. - ffn_dim (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer decoder. - activation_function (`str` or `function`, *optional*, defaults to `"relu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"silu"` and `"gelu_new"` are supported. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - do_layer_norm_before (`bool`, *optional*, defaults to `True`): - Whether to perform layer normalization before the attention block. - word_embed_proj_dim (`int`, *optional*): - `word_embed_proj_dim` can be set to down-project word embeddings, *e.g.* `opt-350m`. Defaults to - `hidden_size`. - dropout (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - activation_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for activations inside the fully connected layer. - layerdrop: (`float`, *optional*, defaults to 0.0): - The LayerDrop probability. 
See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more - details. - init_std (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). - Example: - ```python - >>> from transformers import OPTModel, OPTConfig - >>> # Initializing a OPT facebook/opt-large style configuration - >>> configuration = OPTConfig() - >>> # Initializing a model from the facebook/opt-large style configuration - >>> model = OPTModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "opt" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=50272, - hidden_size=768, - num_hidden_layers=12, - ffn_dim=3072, - max_position_embeddings=2048, - do_layer_norm_before=True, - learned_embeddings=False, # Galileo modification - scale_embeddings=True, # Galileo modification - layer_norm_elementwise_affine=True, # Galileo modification - bias=True, # Galileo modification - word_embed_proj_dim=None, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - num_attention_heads=12, - activation_function="relu", - layerdrop=0.0, - init_std=0.02, - use_cache=True, - pad_token_id=1, - bos_token_id=2, - eos_token_id=2, - **kwargs - ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.num_attention_heads = num_attention_heads - self.word_embed_proj_dim = word_embed_proj_dim if word_embed_proj_dim is not None else hidden_size - self.ffn_dim = ffn_dim - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.dropout = dropout - self.attention_dropout = 
attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.layerdrop = layerdrop - self.use_cache = use_cache - self.do_layer_norm_before = do_layer_norm_before - self.learned_embeddings = learned_embeddings # Galileo modification - self.scale_embeddings = scale_embeddings # Galileo modification - self.layer_norm_elementwise_affine = layer_norm_elementwise_affine # Galileo - self.bias = bias # Galileo diff --git a/galai/consts.py b/galai/consts.py deleted file mode 100644 index 47857dc..0000000 --- a/galai/consts.py +++ /dev/null @@ -1,70 +0,0 @@ -TOKENIZER_URL = 'https://dl.fbaipublicfiles.com/galactica/tokenizer.json' -WEIGHT_DIR = 'https://dl.fbaipublicfiles.com/galactica' - -MINI_FILES = [ - 'config.json', - 'pytorch_model.bin' -] - -BASE_FILES = [ - 'config.json', - 'pytorch_model.bin' -] - -STANDARD_FILES = [ - 'config.json', - 'pytorch_model-00001-of-00002.bin', - 'pytorch_model-00002-of-00002.bin', - 'pytorch_model.bin.index.json' -] - -LARGE_FILES = [ - 'config.json', - 'pytorch_model-00001-of-00007.bin', - 'pytorch_model-00002-of-00007.bin', - 'pytorch_model-00003-of-00007.bin', - 'pytorch_model-00004-of-00007.bin', - 'pytorch_model-00005-of-00007.bin', - 'pytorch_model-00006-of-00007.bin', - 'pytorch_model-00007-of-00007.bin', - 'pytorch_model.bin.index.json' -] - -HUGE_FILES = [ - 'config.json', - 'pytorch_model-00001-of-00026.bin', - 'pytorch_model-00002-of-00026.bin', - 'pytorch_model-00003-of-00026.bin', - 'pytorch_model-00004-of-00026.bin', - 'pytorch_model-00005-of-00026.bin', - 'pytorch_model-00006-of-00026.bin', - 'pytorch_model-00007-of-00026.bin', - 'pytorch_model-00008-of-00026.bin', - 'pytorch_model-00009-of-00026.bin', - 'pytorch_model-00010-of-00026.bin', - 'pytorch_model-00011-of-00026.bin', - 'pytorch_model-00012-of-00026.bin', - 'pytorch_model-00013-of-00026.bin', - 'pytorch_model-00014-of-00026.bin', - 'pytorch_model-00015-of-00026.bin', - 
'pytorch_model-00016-of-00026.bin', - 'pytorch_model-00017-of-00026.bin', - 'pytorch_model-00018-of-00026.bin', - 'pytorch_model-00019-of-00026.bin', - 'pytorch_model-00020-of-00026.bin', - 'pytorch_model-00021-of-00026.bin', - 'pytorch_model-00022-of-00026.bin', - 'pytorch_model-00023-of-00026.bin', - 'pytorch_model-00024-of-00026.bin', - 'pytorch_model-00025-of-00026.bin', - 'pytorch_model-00026-of-00026.bin', - 'pytorch_model.bin.index.json' -] - -CHECKPOINT_PATHS = { - 'mini': [WEIGHT_DIR + '/125m/' + file for file in MINI_FILES], - 'base': [WEIGHT_DIR + '/1.3b/' + file for file in BASE_FILES], - 'standard': [WEIGHT_DIR + '/6.7b/' + file for file in STANDARD_FILES], - 'large': [WEIGHT_DIR + '/30b/' + file for file in LARGE_FILES], - 'huge': [WEIGHT_DIR + '/120b/' + file for file in HUGE_FILES] -} \ No newline at end of file diff --git a/galai/model.py b/galai/model.py index dd5a1ff..866ffbb 100644 --- a/galai/model.py +++ b/galai/model.py @@ -1,14 +1,46 @@ -import os +import warnings +from typing import Union, List + import torch -from accelerate import init_empty_weights, load_checkpoint_and_dispatch +from transformers import AutoTokenizer, OPTForCausalLM, StoppingCriteriaList, StoppingCriteria from parallelformers import parallelize -from tokenizers import Tokenizer +import psutil -from galai.architecture import OPTForCausalLM, OPTConfig from galai.utils import escape_custom_split_sequence +__all__ = ["Model"] + + +class FinishedReferenceCriteria(StoppingCriteria): + """ + A custom criteria to stop generation as soon as all the sequences in the batch have at least + one [END_REF] marker after the prompt. + """ + def __init__(self, prompt_length: int, end_ref_id: int): + """ + Create a new criteria instance for a given generation run. + + Parameters + ---------- + prompt_length : int + The length of the prompt in tokens used to distinguish [END_REF] tokens in the prompt + from the generated [END_REF] tokens. 
For a batch of multiple prompts of different + lengths this should be the length of the longest prompt and other prompts should be + padded. + end_ref_id : int + The [END_REF] token id. + """ + self.prompt_length = prompt_length + self.end_ref_id = end_ref_id + + def __call__(self, input_ids: torch.LongTensor, score: torch.FloatTensor, **kwargs) -> bool: + is_end_ref = (input_ids[:, self.prompt_length:] == self.end_ref_id) + has_end_ref = is_end_ref.any(dim=-1) + return has_end_ref.all() + + class Model(object): """ Model class holding the GALACTICA models. We configure a class to encapsulate the HuggingFace model, @@ -16,7 +48,13 @@ class Model(object): using the standard HuggingFace API. """ - def __init__(self, name: str, dtype: str, num_gpus: int): + def __init__( + self, + name: str, + dtype: str, + num_gpus: int, + tensor_parallel: bool = False, + ): """ Initializes a new model @@ -24,11 +62,25 @@ def __init__(self, name: str, dtype: str, num_gpus: int): ---------- name : str Model name, e.g. `standard`. + + dtype: torch.dtype + Model weights type. + + num_gpus : int + Number of GPUs to use for the inference. If 0 only a CPU is used. If a positive number + n, then the first n CUDA devices are used. + + tensor_parallel : bool + Specify if to use model tensor parallelizm. Ignored in CPU or single GPU inference. 
""" + self.name = name - self.num_gpus = num_gpus self.dtype = dtype self.is_loaded = False + self.num_gpus = num_gpus + self.tensor_parallel = tensor_parallel + self.max_input_length = 2020 + self._master_port = None def _load_checkpoint(self, checkpoint_path: str): """ @@ -39,39 +91,55 @@ def _load_checkpoint(self, checkpoint_path: str): checkpoint_path : str Path for the checkpoint (str) """ - self.config = OPTConfig.from_pretrained(checkpoint_path) - - with init_empty_weights(): - self.model = OPTForCausalLM(self.config) - - self.model.tie_weights() - - device_map = { - 'decoder.embed_tokens': 0, - 'decoder.embed_positions': 0, - 'decoder.layer_norm': 0, - } - - n_layers = self.config.num_hidden_layers - - for i in range(n_layers): - device_map[f"decoder.layers.{i}"] = i * self.num_gpus // n_layers - - if 'mini' in checkpoint_path or 'base' in checkpoint_path: - checkpoint_path = checkpoint_path + '/pytorch_model.bin' - - load_checkpoint_and_dispatch( - self.model.model, - checkpoint_path, - device_map=device_map, - offload_folder=None, - dtype=self.dtype, - offload_state_dict=True - ) - self.model.tie_weights() + # query available memory size of the GPUs we want to use. If tensor_parallel is True, + # we just load the model's weights to RAM, as it needs to be sliced by parallelformers + # before loading to VRAM. 
+ device_map = None + max_memory = {} + if self.num_gpus > 0 and not self.tensor_parallel: + # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274 + for i in range(self.num_gpus): + _ = torch.tensor([0], device=i) + for i in range(self.num_gpus): + max_memory[i] = torch.cuda.mem_get_info(i)[0] + device_map = "auto" + max_memory["cpu"] = psutil.virtual_memory().available + + self.model = OPTForCausalLM.from_pretrained( + checkpoint_path, + torch_dtype=self.dtype, + low_cpu_mem_usage=True, + device_map=device_map, + max_memory=max_memory, + ) self.model.eval() + if self.tensor_parallel: + self._parallelize() + + def _parallelize(self) -> None: + """ + Parallelize the model for a tensor-parallel multi-GPU inference. + """ + + if self.num_gpus < 2: + warnings.warn("At least two GPUs are required to parallelize the model.", UserWarning) + return + + self._master_port = 13000 + (id(self.model) % 32749) + + custom_policies = None + if self.model.config.model_type == "opt" and not self.model.config.enable_bias: + from galai.parallel_policy import OPTDecoderLayerPolicyNoBias + custom_policies = [OPTDecoderLayerPolicyNoBias] + + parallelize( + self.model, num_gpus=self.num_gpus, fp16=self.dtype == torch.float16, + master_port=self._master_port, + custom_policies=custom_policies, + ) + def _set_tokenizer(self, tokenizer_path: str): """ Configures the tokenizer for the model @@ -81,22 +149,100 @@ def _set_tokenizer(self, tokenizer_path: str): tokenizer_path : str Path for the tokenizer (str) """ - self.tokenizer = Tokenizer.from_file(tokenizer_path) - self.tokenizer.enable_padding(direction="left", pad_id=1, pad_type_id=0, pad_token="[PAD]") - self.tokenizer.enable_truncation(max_length=2020, direction="left") + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + + # setup padding + tokenizer.pad_token_id = 1 + tokenizer.pad_token = "" + tokenizer.padding_side = "left" + + # setup 
truncation + tokenizer.truncation_side = "left" + + # setup special tokens + tokenizer.bos_token_id = 0 + tokenizer.bos_token = "" + + tokenizer.eos_token_id = 2 + tokenizer.eos_token = "" + + tokenizer.unk_token = "" + tokenizer.unk_token_id = 3 + + self.tokenizer = tokenizer + + def _tokenize(self, input_text: List[str], new_doc: bool) -> torch.LongTensor: + """ + Apply custom preprocessing to input texts and tokenize them. + + Returns + ------- + input_text : list[str] + Texts to be tokenized + new_doc : bool + If True, prepends the end-of-document () token to each sequence and fixes + padding. + """ + texts = [] + for text in input_text: + text = escape_custom_split_sequence(text) + if not text: + warnings.warn( + "Found an empty input text. Changing to end-of-document token instead.", + UserWarning + ) + text = self.tokenizer.eos_token + texts.append(text) + + if new_doc: + pad_token = self.tokenizer.pad_token + texts = [pad_token + t for t in texts] + + encoded = self.tokenizer( + texts, + padding="longest", + max_length=self.max_input_length, + truncation=True + ) + context_tokens = encoded["input_ids"] + input_v = torch.LongTensor(context_tokens).to(self.model.device) + + if new_doc: + input_v[input_v[:, 0] == self.tokenizer.pad_token_id, 0] = self.tokenizer.eos_token_id + return input_v - def generate(self, input_text: str, max_length=60, new_doc=False, top_p=None) -> str: + @torch.inference_mode() + def generate( + self, + input_text: Union[str, List[str]], + max_length=None, + max_new_tokens=None, + new_doc=False, + top_p=None, + top_k=None, + penalty_alpha=None, + num_beams=1, + num_return_sequences=1, + return_full_text=True, + ) -> Union[str, List[str], List[List[str]]]: """ Generates text using the model Parameters ---------- - input_text : str - Input context for the model to use for its generation, + input_text : str or list[str] + Input context for the model to use for its generation, e.g. 
"Attention Is All You Need [START_REF]" - max_length: int - Maximum length of the generated text + max_length : int (optional) + Maximum length in tokens of the generated text (including prompt). Only one of + max_length and max_new_tokens should be specified. If neither is set, then + max_new_tokens is set to 60. + + max_new_tokens : int (optional) + Maximum length in tokens of the generated text (excluding prompt). Only one of + max_length and max_new_tokens should be specified. If neither is set, then + max_new_tokens is set to 60. new_doc : bool If True, treats generation a new document, otherwise assumes generation could be @@ -104,42 +250,217 @@ def generate(self, input_text: str, max_length=60, new_doc=False, top_p=None) -> # Schwarzschild Radius, # Transformer (machine learning), Title: Transformers, A Survey. For general prompting, turn off. Default is False. + top_p : float or None + If a number, e.g. 0.7, performs top p sampling. Default is None. + + top_k : int or None + If a number, performs top k sampling (if penalty_alpha is None) or contrastive search + decoding (if penalty_alpha > 0). Default is None. + + penalty_alpha : float or None + If a positive number and top_k is set, performs contrastive search decoding with top_k + candidates reranking. Default is None. + + num_beams : int, default 1 + Number of beams to use in beam search. + + num_return_sequences : int, default 1 + Number of generations to return for each prompt. + + Returns + ---------- + str, list[str] or list[list[str]] - generated texts from the model. If input_text is a + singe string, then the output is str if num_return_sequences == 1 or a list of + strings if num_return_sequences > 1. If input_text is an iterable of strings, then the + output is either a list of strings if num_return_sequences == 1 or a list of lists of + strings, in which each inner list contains the generations for a given input prompt. 
+ """ + texts = [input_text] if isinstance(input_text, str) else input_text + input_v = self._tokenize(texts, new_doc) + options = {} + if penalty_alpha is not None: + options["penalty_alpha"] = penalty_alpha + options["top_k"] = top_k + else: + if top_p is not None: + options["do_sample"] = True + options["top_p"] = top_p + if top_k is not None: + options["do_sample"] = True + options["top_k"] = top_k + + if max_new_tokens is None and max_length is None: + max_new_tokens = 60 + out = self.model.generate( + input_v, + max_length=max_length, + max_new_tokens=max_new_tokens, + return_dict_in_generate=True, + output_hidden_states=False, + num_beams=num_beams, + num_return_sequences=num_return_sequences, + **options + ) + + out_tokens = out['sequences'] + if not return_full_text: + out_tokens = out_tokens[:, input_v.shape[1]:] + # we keep special tokens such as [START_REF] or + decoded = self.tokenizer.batch_decode( + out_tokens, + skip_special_tokens=False, + clean_up_tokenization_spaces=False, + ) + # so we manually remove and + decoded = [ + text.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "") + for text in decoded + ] + + if num_return_sequences == 1: + return decoded[0] if isinstance(input_text, str) else decoded + if isinstance(input_text, str): + return decoded + else: + return [ + decoded[num_return_sequences * i:num_return_sequences * (i+1)] + for i in range(len(texts)) + ] + + @torch.inference_mode() + def generate_reference( + self, + input_text: Union[str, List[str]], + max_length=None, + max_new_tokens=None, + new_doc=False, + top_p=None, + suggestions=1, + diversity_penalty=0.0, + ) -> Union[str, List[str], List[List[str]]]: + """ + Generates reference. + + Parameters + ---------- + input_text : str or list[str] + Input context for the model to use for its generation, + e.g. "Attention Is All You Need [START_REF]" + + max_length : int (optional) + Maximum length in tokens of the generated text (including prompt). 
Only one of + max_length and max_new_tokens should be specified. + + max_new_tokens : int (optional) + Maximum length in tokens of the generated text (excluding prompt). Only one of + max_length and max_new_tokens should be specified. If neither is set, then + max_new_tokens is set to 60. + + new_doc : bool + If True, treats generation as a new document, otherwise assumes generation could be + anywhere within document. Use new_doc=True if you are generating documents, e.g. + # Schwarzschild Radius, # Transformer (machine learning), + Title: Transformers, A Survey. For general prompting, turn off. Default is False. + top_p : float or None If None, uses greedy decoding. If a number, e.g. 0.7, performs top p sampling. Default is None. + suggestions : int, default 1 + Number of suggestions to return for each input prompt. Uses beam search to return more + suggestions; when sampling (top_p is set) beam search is not used. + + diversity_penalty : float, default 0.0, ignored if sampling or suggestions == 1 + Returns ---------- - str - generated text from the model + str, list[str] or list[list[str]] - generated reference suggestions from the model. If + input_text is a single string, then the output is str if suggestions == 1 or a list of + strings if suggestions > 1. If input_text is an iterable of strings, then the output is + either a list of strings if suggestions == 1 or a list of lists of strings, in which + each inner list contains the suggestions for a given input prompt.
""" - texts = [escape_custom_split_sequence(input_text)] + texts = [input_text] if isinstance(input_text, str) else input_text + # append [START_REF] token if missing + fixed_texts = [] + for text in texts: + start_ref_pos = text.rfind("[START_REF]") + if start_ref_pos == -1: + fixed_texts.append(text + "[START_REF]") + else: + end_ref_pos = text.find("[END_REF]", start_ref_pos) + if end_ref_pos != -1: + # the last [START_REF] is closed with [END_REF], let's add another one + fixed_texts.append(text + "[START_REF]") + else: + # avoid spaces after [START_REF] token for better results + fixed_texts.append(text.rstrip()) - if new_doc: - pad_id = self.tokenizer.padding["pad_id"] - pad_token = self.tokenizer.id_to_token(pad_id) - texts = [pad_token + t for t in texts] + input_v = self._tokenize(fixed_texts, new_doc) - list_encoded = self.tokenizer.encode_batch(texts) - context_tokens = [encoded.ids for encoded in list_encoded] - input_v = torch.LongTensor(context_tokens).to(self.model.device) + prompt_length = input_v.shape[1] + finished_reference_criteria = FinishedReferenceCriteria( + prompt_length=prompt_length, + end_ref_id=self.tokenizer.convert_tokens_to_ids("[END_REF]"), + ) + if max_new_tokens is None and max_length is None: + max_new_tokens = 60 + + stopping_criteria = StoppingCriteriaList([finished_reference_criteria]) if top_p is not None: out = self.model.generate( - input_v, - max_length=max_length, - return_dict_in_generate=True, - output_hidden_states=True, + input_v, + max_length=max_length, + max_new_tokens=max_new_tokens, + return_dict_in_generate=True, + output_hidden_states=False, top_p=top_p, - do_sample=True + do_sample=True, + num_return_sequences=suggestions, + stopping_criteria=stopping_criteria, ) else: out = self.model.generate( - input_v, - max_length=max_length, - return_dict_in_generate=True, - output_hidden_states=True + input_v, + max_length=max_length, + max_new_tokens=max_new_tokens, + num_beams=suggestions, + 
num_return_sequences=suggestions, + num_beam_groups=suggestions if diversity_penalty > 0.0 else 1, + diversity_penalty=diversity_penalty, + return_dict_in_generate=True, + output_hidden_states=False, + stopping_criteria=stopping_criteria, + ) + # cut-off the prompts + generated_tokens = out["sequences"][:, prompt_length:] + decoded = self.tokenizer.batch_decode( + generated_tokens, + skip_special_tokens=False, + clean_up_tokenization_spaces=False, + ) + references = [] + unfinished_generation = False + for text in decoded: + end_ref_pos = text.find("[END_REF]") + if end_ref_pos == -1: + unfinished_generation = True + references.append(text.strip()) + else: + references.append(text[:end_ref_pos].strip()) + if unfinished_generation: + warnings.warn( + "At least one of the generated references may be incomplete. Consider increasing max_length or max_new_tokens.", + UserWarning ) - - return self.tokenizer.decode_batch( - out['sequences'].tolist(), - skip_special_tokens=False)[0].lstrip('') + + if suggestions == 1: + return references[0] if isinstance(input_text, str) else references + if isinstance(input_text, str): + return references + else: + return [ + references[suggestions * i:suggestions * (i+1)] + for i in range(len(texts)) + ] diff --git a/galai/notebook_utils.py b/galai/notebook_utils.py new file mode 100644 index 0000000..2996108 --- /dev/null +++ b/galai/notebook_utils.py @@ -0,0 +1,108 @@ +from IPython.display import HTML +import markdown as md +import bleach +from bleach.css_sanitizer import CSSSanitizer + + +__all__ = ["display_markdown", "display_latex"] + +ALLOWED_TAGS = [ + "a", + "abbr", + "acronym", + "b", + "blockquote", + "br", + "code", + "div", + "em", + "h1", + "h2", + "h3", + "h4", + "h5", + "i", + "li", + "ol", + "strong", + "ul", + "span", + "table", + "thead", + "tbody", + "tr", + "td", + "th", + "p", + "pre", +] + +ALLOWED_ATTRIBUTES = { + "a": ["href", "title"], + "abbr": ["title"], + "acronym": ["title"], + "div": ["class"], + "span": 
["style", "class"], + "td": ["align", "valign"], + "th": ["align", "valign"], +} + +ALLOWED_CSS_PROPERTIES = [ + "width", "margin", "margin-left", "margin-right", + "margin-bottom", "margin-top", "height", "color", "font-weight" +] + + +def clean_html(value, tags=None, attributes=None, css_sanitizer=None): + if tags is None: + tags = ALLOWED_TAGS + if attributes is None: + attributes = ALLOWED_ATTRIBUTES + if css_sanitizer is None: + css_sanitizer = CSSSanitizer(allowed_css_properties=ALLOWED_CSS_PROPERTIES) + elif isinstance(css_sanitizer, list): + css_sanitizer = CSSSanitizer(allowed_css_properties=css_sanitizer) + + cleaned = bleach.clean( + value, + tags=tags, + attributes=attributes, + css_sanitizer=css_sanitizer, + ) + + return cleaned + + +def _markdown2html_unsafe(value): + """Converts markdown to unsanitized HTML.""" + out = md.markdown( + value, + extensions=[ + "markdown.extensions.tables", "fenced_code", "codehilite" + ], + ) + return out + + +def markdown2html(value): + return clean_html(_markdown2html_unsafe(value)) + + +def display_markdown(text): + # normalize LaTeX tags + text = text.replace(r"\(", "$").replace(r"\)", "$").replace(r"\[", "$$").replace(r"\]", "$$") + # convert to markdown and sanitize + text = markdown2html(text) + # use IPython.display.HTML instead of IPython.display.Markdown so that the output is + # rendered properly on notebook load without cells reevaluations + return HTML(text) + + +def display_latex(text): + # normalize LaTeX tags + text = text.replace(r"\(", "$").replace(r"\)", "$").replace(r"\[", "$$").replace(r"\]", "$$") + # the text is going to be parsed as + text = clean_html(text, tags=[], attributes=[], css_sanitizer=[]) + # use IPython.display.HTML instead of IPython.display.Latex so that the output is + # rendered properly on notebook load without cells reevaluations + return HTML(text) diff --git a/galai/parallel_policy.py b/galai/parallel_policy.py new file mode 100644 index 0000000..d4fabd4 --- /dev/null +++ 
b/galai/parallel_policy.py @@ -0,0 +1,60 @@ +from parallelformers.policies.base import Layer, Policy +from parallelformers.utils.dist_utils import AllReduceLinear + +from transformers.models.opt.modeling_opt import OPTDecoderLayer + + +__all__ = ["OPTDecoderLayerPolicyNoBias"] + + +class OPTDecoderLayerPolicyNoBias(Policy): + @staticmethod + def replace_arguments(config, world_size): + return { + "self_attn.embed_dim": config.hidden_size // world_size, + "self_attn.num_heads": config.num_attention_heads // world_size, + } + + @staticmethod + def attn_qkv(): + return [ + Layer( + weight="self_attn.q_proj.weight", + ), + Layer( + weight="self_attn.k_proj.weight", + ), + Layer( + weight="self_attn.v_proj.weight", + ), + ] + + @staticmethod + def attn_out(): + return [ + Layer( + weight="self_attn.out_proj.weight", + replace=AllReduceLinear, + ), + ] + + @staticmethod + def mlp_in(): + return [ + Layer( + weight="fc1.weight", + ), + ] + + @staticmethod + def mlp_out(): + return [ + Layer( + weight="fc2.weight", + replace=AllReduceLinear, + ), + ] + + @staticmethod + def original_layer_class(): + return OPTDecoderLayer diff --git a/galai/utils.py b/galai/utils.py index d930e8e..26b2409 100644 --- a/galai/utils.py +++ b/galai/utils.py @@ -1,9 +1,15 @@ -import os import re -import tqdm -import urllib +from typing import List +import math +import html + +from dataclasses import dataclass + + +__all__ = [ + "escape_custom_split_sequence", "ModelInfo", +] -from galai.consts import CHECKPOINT_PATHS, TOKENIZER_URL # we split individual characters inside special tokens like [START_DNA] CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])") @@ -14,10 +20,6 @@ # literally in the source code in case we ever include it in the training data. 
SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E" -ENV_TORCH_HOME = 'TORCH_HOME' -ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' -DEFAULT_CACHE_DIR = '~/.cache' - def _insert_split_marker(m: re.Match): """ @@ -37,6 +39,7 @@ def _insert_split_marker(m: re.Match): sequence = re.sub(r"(.)", fr"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL) return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}" + def escape_custom_split_sequence(text): """ Applies custom splitting to the text for GALILEO's tokenization @@ -52,97 +55,107 @@ def escape_custom_split_sequence(text): """ return CUSTOM_SEQ_RE.sub(_insert_split_marker, text) -def _get_cache_home(): - cache_home = os.path.expanduser( - os.getenv(ENV_TORCH_HOME, - os.path.join(os.getenv(ENV_XDG_CACHE_HOME, - DEFAULT_CACHE_DIR), 'galactica'))) - return cache_home - - -class DownloadProgressBar(tqdm.tqdm): - def update_to(self, b=1, bsize=1, tsize=None): - if tsize is not None: - self.total = tsize - self.update(b * bsize - self.n) - - -def _download_file(file_url: str, file_loc: str): - with DownloadProgressBar(unit='B', unit_scale=True, - miniters=1, desc=file_url.split('/')[-1]) as t: - urllib.request.urlretrieve(file_url, filename=file_loc, reporthook=t.update_to) - -def download_model(model_name: str, model_path: str): - - for file_url in tqdm.tqdm(CHECKPOINT_PATHS[model_name]): - file_loc = os.path.join(model_path, file_url.split('/')[-1]) - if os.path.exists(file_loc): - continue - _download_file(file_url, file_loc) - -def download_tokenizer(tokenizer_path: str): - _download_file(TOKENIZER_URL, tokenizer_path) - -def get_checkpoint_path(model_name: str) -> str: - """ - Downloads checkpoint if not in the ~/.cache/galai/ directory. - Once all files are available, it returns the path. - - Parameters - ---------- - model_name : str - Name of the model, e.g. 
'mini' - - Returns - ---------- - str - the path of the model weights - """ - cache_dir = _get_cache_home() - - if not os.path.exists(cache_dir): - os.mkdir(cache_dir) - - model_path = os.path.join(cache_dir, f"{model_name}.pt") - - if not os.path.exists(model_path): - os.mkdir(model_path) - - if os.path.exists(model_path): - for file in CHECKPOINT_PATHS[model_name]: - file_name = os.path.join(model_path, file.split('/')[-1]) - if not os.path.exists(file_name): - print('Incomplete files for model; downloading') - download_model(model_name=model_name, model_path=model_path) - return model_path - else: - download_model(model_name=model_name, model_path=model_path) - return model_path - -def get_tokenizer_path() -> str: - """ - Downloads tokenizer if not in the ~/.cache/galai/ directory. - Once all files are available, it returns the path. - - Returns - ---------- - str - the path of the tokenizer - """ - cache_dir = _get_cache_home() - - if not os.path.exists(cache_dir): - os.mkdir(cache_dir) - - tokenizer_path = os.path.join(cache_dir, 'tokenizer') - - if not os.path.exists(tokenizer_path): - os.mkdir(tokenizer_path) - file_name = os.path.join(tokenizer_path, 'tokenizer.json') - if os.path.exists(tokenizer_path): - if not os.path.exists(file_name): - print('Incomplete files for tokenizer; downloading') - download_tokenizer(file_name) - return file_name - else: - download_tokenizer(file_name) - return file_name +REFERENCE_RE = re.compile(r"\[START_REF\](.*?)\[END_REF\]", flags=re.DOTALL) + + +def extract_references_from_text(text: str) -> List[str]: + return [cit.strip() for cit in REFERENCE_RE.findall(text)] + + +@dataclass +class ModelInfo: + name: str + num_layers: int + num_heads: int + head_size: int = 128 + vocab_size: int = 50000 + max_positions: int = 2048 + + @property + def hidden_dimension(self) -> int: + return self.head_size * self.num_heads + + @property + def parameters(self) -> int: + layer_norm_elementwise_affine = True + enable_bias = True + h_dim = 
self.hidden_dimension + bias = h_dim if enable_bias else 0 + embed_tokens_size = self.vocab_size * h_dim + embed_positions_size = (self.max_positions + 2) * h_dim + layer_norm_size = 2 * h_dim if layer_norm_elementwise_affine else 0 + self_attn_size = 4 * (h_dim * h_dim + bias) # 4 = k_proj+v_proj+q_proj+out_proj + ffn_dim = 4 * h_dim + fc_size = 2 * h_dim * ffn_dim + 5 * bias # 2 = fc1 + fc2 + decoder_layer_size = self_attn_size + fc_size + 2 * layer_norm_size + decoder_size = self.num_layers * decoder_layer_size + layer_norm_size + embed_tokens_size + embed_positions_size + + return decoder_size + + @property + def disk_size(self) -> int: + """Approximate dist size in bytes of checkpoints files""" + return self.parameters * 2 + + def weights_size(self, dtype="float16") -> int: + """Approximate total size of model weights in memory""" + element_size = 2 if dtype == "float16" else 4 + return self.parameters * element_size + + def memory_per_token(self, dtype="float16") -> int: + """Approximate memory size required to store intermediate activations and cached outputs""" + element_size = 2 if dtype == "float16" else 4 + return 2 * self.num_layers * self.num_heads * self.head_size * element_size + + @staticmethod + def by_name(name: str) -> "ModelInfo": + return _MODEL_INFO_BY_NAME[name] + + @staticmethod + def all() -> List["ModelInfo"]: + return _MODEL_INFO + + +def _humanize(parameters): + scale = min(int(math.log10(parameters)) // 3, 4) + suffix = " KMBT"[scale] + + return f"{parameters / math.pow(10, 3 * scale):.1f} {suffix}".rstrip() + + +class ModelInfoList(list): + def _repr_html_(self): + if not self: + return "" + columns = { + "Name": lambda m: f"{html.escape(m.name)}", + "Parameters": lambda m: _humanize(m.parameters), + "Layers": lambda m: str(m.num_layers), + "Heads": lambda m: str(m.num_heads), + "Head Size": lambda m: str(m.head_size), + "Vocabulary Size": lambda m: str(m.vocab_size), + "Context Size": lambda m: str(m.max_positions), + } + output = 
[""] + for col in columns: + output.append(f"") + output.append("") + for mi in self: + output.append("") + for extractor in columns.values(): + output.append(f"") + output.append("") + output.append("
{col}
{extractor(mi)}
") + return "".join(output) + + +_MODEL_INFO = ModelInfoList([ + ModelInfo("mini", num_layers=12, num_heads=12, head_size=64), + ModelInfo("base", num_layers=24, num_heads=32, head_size=64), + ModelInfo("standard", num_layers=32, num_heads=32, head_size=128), + ModelInfo("large", num_layers=48, num_heads=56, head_size=128), + ModelInfo("huge", num_layers=96, num_heads=80, head_size=128), +]) + +_MODEL_INFO_BY_NAME = {model.name: model for model in _MODEL_INFO} diff --git a/notebooks/Introduction to Galactica Models.ipynb b/notebooks/Introduction to Galactica Models.ipynb new file mode 100644 index 0000000..4d52f7e --- /dev/null +++ b/notebooks/Introduction to Galactica Models.ipynb @@ -0,0 +1,3346 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "909676ae", + "metadata": {}, + "source": [ + "# Introduction to GALACTICA Models\n", + "\n", + "Galactica is a family of language models trained on a novel high-quality scientific dataset, making the models capable of working with scientific terminology, math and chemical formulas as well as source codes.\n", + "\n", + "The easiest way to use the models is through our library called `galai` which provides convenience utilities to get the models, run generation and work with scientific entites of various types.\n", + "\n", + "This document is split into 5 main sections.\n", + "\n", + "* Quick Start\n", + "* The `huge` Model Capabilities\n", + " + Citations\n", + " + Step-by-Step Reasoning\n", + " + Storage Knowledge\n", + " + Compositions\n", + "* Text Generation & Sampling\n", + "* Working with Large Models\n", + "* Non-determinism\n", + "* Pitfalls & Failure Examples\n", + "\n", + "\n", + "**Note:** this notebook is best viewed using jupyter notebook or [nbviewer](https://nbviewer.org/). Other tools might not render all of our custom tokens, such as `` (which should be rendered as `< work >` without spaces) or `` (which should be rendered as `< / s >` without spaces). 
You can also view the PDF version of this notebook available in the same directory." + ] + }, + { + "cell_type": "markdown", + "id": "5a7c45ca", + "metadata": {}, + "source": [ + "# Quick Start\n", + "\n", + "You can install the `galai` library using `pip` (requires `python>=3.7`):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd328502", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install galai" + ] + }, + { + "cell_type": "markdown", + "id": "9185f1ee", + "metadata": {}, + "source": [ + "Let's verify the installation by running generation with the base model (1.3B). We load it with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4833fedf", + "metadata": {}, + "outputs": [], + "source": [ + "import galai as gal\n", + "from galai.notebook_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9080e5a6", + "metadata": {}, + "outputs": [], + "source": [ + "model = gal.load_model(\"base\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c30cc34", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The Transformer architecture [START_REF] Attention is All you Need, Vaswani[END_REF] is a popular choice for sequence-to-sequence models. It consists of a stack of encoder and decoder layers, each of which is composed of a multi-head self-attention mechanism and a feed-forward network. 
The encoder is used to encode the'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.generate(\"The Transformer architecture [START_REF]\")" + ] + }, + { + "cell_type": "markdown", + "id": "ded9905c", + "metadata": {}, + "source": [ + "We can also generate math:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb16c212", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "The Riemann zeta function is given by:\n", + "\n", + "$$ \\zeta(s)=\\sum_{n=1}^{\\infty}\\frac{1}{n^{s}},\\quad\\Re(s)>1. $$\n", + "\n", + "The Riemann hypothesis (RH) states that the zeros of" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = \"The Riemann zeta function is given by:\\n\\n\\\\[\"\n", + "output = model.generate(prompt, max_new_tokens=60)\n", + "display_latex(output)" + ] + }, + { + "cell_type": "markdown", + "id": "545b5294", + "metadata": {}, + "source": [ + "There are 5 models in total (see more below in Model Selection Section):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6887504b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
NameParametersLayersHeadsHead SizeVocabulary SizeContext Size
mini125.0 M121264500002048
base1.3 B243264500002048
standard6.7 B3232128500002048
large30.0 B4856128500002048
huge121.3 B9680128500002048
" + ], + "text/plain": [ + "[ModelInfo(name='mini', num_layers=12, num_heads=12, head_size=64, vocab_size=50000, max_positions=2048),\n", + " ModelInfo(name='base', num_layers=24, num_heads=32, head_size=64, vocab_size=50000, max_positions=2048),\n", + " ModelInfo(name='standard', num_layers=32, num_heads=32, head_size=128, vocab_size=50000, max_positions=2048),\n", + " ModelInfo(name='large', num_layers=48, num_heads=56, head_size=128, vocab_size=50000, max_positions=2048),\n", + " ModelInfo(name='huge', num_layers=96, num_heads=80, head_size=128, vocab_size=50000, max_positions=2048)]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from galai.utils import ModelInfo\n", + "ModelInfo.all()" + ] + }, + { + "cell_type": "markdown", + "id": "3e60c591", + "metadata": {}, + "source": [ + "# The `huge` Model Capabilities" + ] + }, + { + "cell_type": "markdown", + "id": "13b4ee1c", + "metadata": {}, + "source": [ + "In this Section we present the capabilities of the Galactica models. We use the `huge` 121 B parameters model with tensor parallelizm (see the Working with Large Models Section for more details):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ea98dab", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "model = gal.load_model(\"huge\", parallelize=True)" + ] + }, + { + "cell_type": "markdown", + "id": "394588b3", + "metadata": {}, + "source": [ + "## Citations\n", + "\n", + "Galactica models are trained on a large corpus comprising more than 360 millions in-context citations and over 50 millions of unique references normalized across a diverse set of sources. This enables Galactica to suggest citations and help discover related papers.\n", + "\n", + "Each reference in our corpus is formatted as \"Title, First author\" and wrapped in a pair of `[START_REF]` / `[END_REF]` tokens. 
The tokens make it easy to steer the models into citing a reference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22ed4b21", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Galactica models are based on OPT architecture [START_REF] OPT: Open Pre-trained Transformer Language Models, Zhang[END_REF], which is a variant of the GPT-2 model [START_REF] Language Models are Unsupervised Multitask Learners, Radford[END_REF]. The OPT model is a 12-layer transformer with 12 attention heads and 768'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.generate(\"Galactica models are based on OPT architecture [START_REF]\")" + ] + }, + { + "cell_type": "markdown", + "id": "f6b26ebe", + "metadata": {}, + "source": [ + "To make it easier to generate references we provide a convenience function `Model.generate_reference` that automatically handles the `[START_REF]` / `[END_REF]` tokens and avoids generating more output than necessary for faster inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28b2b052", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'On the rapid computation of various polylogarithmic constants, Bailey'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.generate_reference(\"The paper introducing the formula for the $n$-th digit of $\\\\pi$ in base $16$\")" + ] + }, + { + "cell_type": "markdown", + "id": "839d59a6", + "metadata": {}, + "source": [ + "The call above appends the `[START_REF]` token to the prompt, and runs the generation up to the first occurrence of the `[END_REF]` token.\n", + "\n", + "> Please note that while in the example above the returned paper (\"On the rapid computation of various polylogarithmic constants\" by Bailey et al.)
matches the description, the generations should be treated as suggestions of papers and should always be verified. Bear in mind that due to the non-determinism (see Non-deterministic Generation Section for more information) your results might be different." + ] + }, + { + "cell_type": "markdown", + "id": "93c563bf", + "metadata": {}, + "source": [ + "The multiple modalities that Galactica is able to work with allow us to query for papers using math, source code, etc.:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dfbd9d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Prompt: The paper that presented a novel computing block given by the formula:\n", + "$$\n", + "f(Q, K, V) = \\textrm{softmax}\\left(\\frac{QK^T}{\\sqrt{d_k}}\\right)V\n", + "$$

\n", + "

Reference: Attention is All you Need, Vaswani

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = \"\"\"The paper that presented a novel computing block given by the formula:\n", + "\\\\[\n", + "f(Q, K, V) = \\\\textrm{softmax}\\\\left(\\\\frac{QK^T}{\\\\sqrt{d_k}}\\\\right)V\n", + "\\\\]\n", + "\n", + "\"\"\"\n", + "reference = model.generate_reference(prompt)\n", + "display_markdown(f\"**Prompt**: {prompt}\\n\\n**Reference**: {reference}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ab54dae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Prompt:

\n", + "
while k > 1:\n",
+       "    if k % 2 == 0:\n",
+       "        k = k // 2\n",
+       "    else:\n",
+       "        k = 3 * k + 1\n",
+       "
\n", + "\n", + "

A paper studying if the loop above terminates for all positive integers

\n", + "

Reference: On the Collatz $3n+1$ algorithm, Garner

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = \"\"\"```python\n", + "while k > 1:\n", + " if k % 2 == 0:\n", + " k = k // 2\n", + " else:\n", + " k = 3 * k + 1\n", + "```\n", + "\n", + "A paper studying if the loop above terminates for all positive integers \"\"\"\n", + "reference = model.generate_reference(prompt)\n", + "display_markdown(f\"**Prompt**:\\n{prompt}\\n\\n**Reference**: {reference}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7d7b48bf", + "metadata": {}, + "source": [ + "You can get multiple suggestions of reference for a given prompt by setting `suggestions` parameter. With `suggestions > 1` a beam search decoding is used to try to generate more suggestions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb5ada04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The Amyloid Hypothesis of Alzheimer's Disease: Progress and Problems on the Road to Therapeutics, Hardy\n", + "The amyloid cascade hypothesis for Alzheimer's disease: an appraisal for the development of therapeutics, Karran\n", + "The amyloid hypothesis of Alzheimer's disease at 25 years, Selkoe\n", + "The amyloid hypothesis of Alzheimer's disease at 25 years, Selkoe\n", + "The amyloid hypothesis of Alzheimer's disease at 25 years, Selkoe\n" + ] + } + ], + "source": [ + "for reference in model.generate_reference(\n", + " \"A survey paper on the amyloid hypothesis\",\n", + " suggestions=5\n", + "):\n", + " print(reference)" + ] + }, + { + "cell_type": "markdown", + "id": "e36dbf5d", + "metadata": {}, + "source": [ + "As apparent from the example above, some of the references may repeat. 
Setting `diversity_penalty` to a number between `0.0` and `1.0` switches the generation algorithm to [Diverse beam search](https://arxiv.org/pdf/1610.02424.pdf):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5f81db0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The amyloid hypothesis of Alzheimer's disease at 25 years, Selkoe\n", + "Alzheimer's disease: the amyloid cascade hypothesis., Hardy\n", + "The Amyloid Hypothesis of Alzheimer's Disease: Progress and Problems on the Road to Therapeutics, Hardy\n", + "Amyloid-β and tau: the trigger and bullet in Alzheimer disease pathogenesis., Bloom\n", + "The amyloid hypothesis of Alzheimer's disease at 25 years, Selkoe\n" + ] + } + ], + "source": [ + "for reference in model.generate_reference(\n", + " \"A survey paper on the amyloid hypothesis\",\n", + " suggestions=5, diversity_penalty=0.9\n", + "):\n", + " print(reference)" + ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "bd545100", + "metadata": {}, + "source": [ + "### Citation Distribution Bias\n", + "\n", + "Language models may encode and amplify biases present in the training corpus. Galactica models are biased towards referencing more frequently cited papers. Even though our analysis shows that as the model size increases the bias gets smaller, the difference is still present:\n", + "\n", + "![image.png](attachment:image.png)\n", + "\n", + "See [our paper](https://galactica.org/static/paper.pdf) for more details." + ] + }, + { + "cell_type": "markdown", + "id": "af51ab15", + "metadata": {}, + "source": [ + "## Step-by-Step Reasoning\n", + "\n", + "Recent work (e.g., [Wei et al.](https://arxiv.org/abs/2201.11903), [Suzgun et al.](https://arxiv.org/abs/2210.09261)) has shown that chain-of-thought prompting can improve performance of large language models on complex reasoning tasks.
In the NatureBook corpus used to train Galactica models we introduced a pair of special tokens - `<work>` and `</work>` to mark sections of fine-grained step-by-step reasoning. An explicit `<work>` token makes it easier to bias the generation into step-by-step reasoning. Compare the two queries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a83d6d78", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: A bat and a ball cost $\\$1.10$ in total. The bat costs $\\$1.00$ more than the ball. How much does the ball cost?

\n", + "

Answer: $\\$0.10$

\n", + "

</work>

\n", + "

Ans: $\\$0.10$

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = f\"Question: A bat and a ball cost $\\\\$1.10$ in total. The bat costs $\\\\$1.00$ more than the ball. How much does the ball cost?\\n\\nAnswer:\"\n", + "display_markdown(model.generate(prompt, new_doc=True, max_new_tokens=250))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "692bf1fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: A bat and a ball cost $\\$1.10$ in total. The bat costs $\\$1.00$ more than the ball. How much does the ball cost?

\n", + "

<work>

\n", + "

Let $x$ represent the ball's cost.

\n", + "

The bat costs $x+\\$1.00$.

\n", + "

The bat and the ball cost $x+(x+\\$1.00)=\\$1.10$.

\n", + "

$2x+\\$1.00=\\$1.10$

\n", + "

$2x=\\$0.10$

\n", + "

$x=\\$0.05$

\n", + "

The ball costs $\\$0.05$.

\n", + "

</work>

\n", + "

Ans: The ball costs $\\$0.05$.

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = f\"Question: A bat and a ball cost $\\\\$1.10$ in total. The bat costs $\\\\$1.00$ more than the ball. How much does the ball cost?\\n\\n\"\n", + "display_markdown(model.generate(prompt, new_doc=True, max_new_tokens=250))" + ] + }, + { + "cell_type": "markdown", + "id": "b946c79b", + "metadata": {}, + "source": [ + "### Python Evaluation\n", + "\n", + "Additionally, the `` section can include a python code used to run external computations. For example," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc0c9775", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

What is the $7$-th harmonic number of the second order? Answer with a source code.

\n", + "

<work>\n", + "harmonic.py

\n", + "
ans = sum(1/n**2 for n in range(1, 7 + 1))\n",
+       "\n",
+       "with open("output.txt", "w") as file:\n",
+       "    file.write(str(ans))\n",
+       "
\n", + "\n", + "

<<run: \"harmonic.py\">>

\n", + "

<<read: \"output.txt\">>

\n", + "

1.3852941429414294\n", + "</work>

\n", + "

A: 1.3852941429414294

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(\n", + " model.generate(\n", + " \"What is the $7$-th harmonic number of the second order? Answer with a source code.\\n\\n\",\n", + " max_new_tokens=300,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1458bff2", + "metadata": {}, + "source": [ + "While the numerical answer is incorrect, the generated code correctly implements the formula above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d47e8fdd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.511797052154195" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(1/n**2 for n in range(1, 7 + 1))" + ] + }, + { + "cell_type": "markdown", + "id": "023c2a4d", + "metadata": {}, + "source": [ + "## Stored knowledge\n", + "\n", + "We can use generation to retrieve definitions, formulas, source code and more:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85fd507e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Corticosteroid\n", + "\n", + " Corticosteroids are a class of steroid hormones that are produced in the adrenal cortex of vertebrates. They are involved in a wide range of physiological processes, including metabolism, immune function, and stress response.[START_REF] Corticosteroids: Mechanisms of Action in Health and Disease, Ramamoorthy[END_REF]\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(model.generate(\"# Corticosteroid\\n\", new_doc=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42bd5446", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "The $n$-th harmonic number of the second order is given by the formula:\n", + "\n", + "$$ H_{n}^{(2)}=\\sum_{k=1}^{n}\\frac{1}{k^{2}}. 
$$" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_latex(model.generate(\n", + " \"The \\\\(n\\\\)-th harmonic number of the second order is given by the formula:\\n\\n\\\\[\",\n", + " max_new_tokens=40,\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12dc2ab3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The IUPAC name of cortisol is: 11β,17α,21-trihydroxypregn-4-ene-3,20-dione.\n", + "\n", + "## See also\n", + "\n", + "* Cortisone\n", + "* Corticosterone\n", + "* Hydrocortisone\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(model.generate(\"The IUPAC name of cortisol is:\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "940675f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Mixing a kitchen salt with sulfuric acid results in the following reaction:\n", + "\n", + "$$ \\ce{NaCl}(aq)+\\ce{H2SO4}(aq)⟶\\ce{NaHSO4}(aq)+\\ce{HCl}(g) $$\n", + "\n", + "The hydrogen chloride gas is a strong acid and will react with any base that it comes into contact with.\n", + "\n", + "$$ \\ce" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_latex(model.generate(\"Mixing a kitchen salt with sulfuric acid results in the following reaction:\", max_new_tokens=80))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10e0d13e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Use find to list all PNG files larger than 1 megabyte:\n", + "\n", + "```\n", + "find . 
-name \"*.png\" -size +1M\n", + "```\n" + ] + } + ], + "source": [ + "print(model.generate(\"Use find to list all PNG files larger than 1 megabyte:\", max_new_tokens=25))" + ] + }, + { + "cell_type": "markdown", + "id": "0d9d3804", + "metadata": {}, + "source": [ + "## Composition\n", + "\n", + "Galactica models are able to mix & combine scientific modalities, stored knowledge and generalize to new tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75758fbd", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: Translate the following python code:

\n", + "
def cheapestProduct(products: List[Product]) -> Product:\n",
+       "    return min(products, key=lambda p: p.price)\n",
+       "
\n", + "\n", + "

into C++.

\n", + "

Answer:

\n", + "
Product cheapestProduct(std::vector<Product> products) {\n",
+       "    Product min_product = products[0];\n",
+       "    for (auto product : products) {\n",
+       "        if (product.price < min_product.price) {\n",
+       "            min_product = product;\n",
+       "        }\n",
+       "    }\n",
+       "    return min_product;\n",
+       "}\n",
+       "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(model.generate(\"\"\"Question: Translate the following python code:\n", + "\n", + "```python\n", + "def cheapestProduct(products: List[Product]) -> Product:\n", + " return min(products, key=lambda p: p.price)\n", + "```\n", + "\n", + "into C++.\n", + "\n", + "Answer:\"\"\", max_new_tokens=150))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c4a0c4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: Translate the following math formula:

\n", + "

$$\n", + " \\zeta(s) = \\sum_{n=1}^{\\infty} n^{-s}\n", + "$$

\n", + "

into plain English.

\n", + "

Answer:

\n", + "

The zeta function is the sum of the reciprocals of the positive integers raised to the $s$th power.

\n", + "

</work>

\n", + "

Answer: $\\zeta(s) = \\sum_{n=1}^{\\infty} n^{-s}$

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(model.generate(\"\"\"Question: Translate the following math formula:\n", + "\n", + "\\\\[\n", + " \\\\zeta(s) = \\\\sum_{n=1}^{\\\\infty} n^{-s}\n", + "\\\\]\n", + "\n", + "into plain English.\n", + "\n", + "Answer:\"\"\", max_new_tokens=100))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8ad61f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: Translate the following math formula:

\n", + "

$$\n", + " \\zeta(s) = \\sum_{n=1}^{\\infty} n^{-s}\n", + "$$

\n", + "

into python code.

\n", + "

Answer:

\n", + "
def zeta(s):\n",
+       "    return sum([n**-s for n in range(1, 100)])\n",
+       "
\n", + "\n", + "

The zeta function is a sum of an infinite number of terms. In order to compute it, we need to approximate it with a finite sum. The function above computes the sum of the first 100 terms.

\n", + "

The zeta function is a very important function in mathematics. It is used to compute

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(model.generate(\"\"\"Question: Translate the following math formula:\n", + "\n", + "\\\\[\n", + " \\\\zeta(s) = \\\\sum_{n=1}^{\\\\infty} n^{-s}\n", + "\\\\]\n", + "\n", + "into python code.\n", + "\n", + "Answer:\"\"\", max_new_tokens=100))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b67bbc08", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: Translate the following math formula:

\n", + "

$$\n", + " f(x) = \\int_0^x \\frac{\\cos(2\\cdot t)}{\\sqrt{2\\pi}} dt.\n", + "$$

\n", + "

into python code using sympy package.

\n", + "

Answer:

\n", + "
from sympy import *\n",
+       "f = Integral(cos(2*t)/sqrt(2*pi), (t, 0, x))\n",
+       "
\n", + "\n", + "

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(model.generate(\"\"\"Question: Translate the following math formula:\n", + "\n", + "\\\\[\n", + " f(x) = \\\\int_0^x \\\\frac{\\\\cos(2\\cdot t)}{\\\\sqrt{2\\\\pi}} dt.\n", + "\\\\]\n", + "\n", + "into python code using sympy package.\n", + "\n", + "Answer:\"\"\", max_new_tokens=45))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e9b26f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: What is the expected value of a random variable uniformly distributed over the interval $[a^2, b+c]$?

\n", + "

Answer: $\\frac{b+c+a^2}{2}$

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output = model.generate(\"\"\"Question: What is the expected value of a random variable uniformly distributed over the interval \\\\([a^2, b+c]\\\\)?\n", + "\n", + "Answer:\"\"\", max_new_tokens=20)\n", + "display_markdown(output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c07a1ad1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: What is the expected value of a random variable uniformly distributed over the interval $[a^2, b+c]$?

\n", + "

Answer: $\\frac{b+c+a^2}{2}$

\n", + "

Question: Rewrite the formula above in Mathematica.

\n", + "

Answer: $\\text{Expectation}[x,x\\sim\\text{UniformDistribution}[a^2,b+c]]$

\n", + "

Question: What is the expected value of a random variable uniformly distributed over the interval $[a, b]

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(\n", + " model.generate(\n", + " f\"{output.rstrip()}\\n\\nQuestion: Rewrite the formula above in Mathematica.\\n\\nAnswer:\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f39718b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: Translate the following python code:

\n", + "
import requests\n",
+       "import re\n",
+       "\n",
+       "def get_datasets():\n",
+       "    req = requests.get('https://paperswithcode.com/datasets')\n",
+       "    if req.ok:\n",
+       "        match = re.search(r'(\\d+) dataset results', req.text)\n",
+       "        return int(match.group(1)) if match else None\n",
+       "    return None\n",
+       "
\n", + "\n", + "

into Javascript.

\n", + "

Answer:

\n", + "
const getDatasets = () => {\n",
+       "  const req = fetch('https://paperswithcode.com/datasets')\n",
+       "  if (req.ok) {\n",
+       "    const match = /(\\d+) dataset results/.exec(req.text)\n",
+       "    return match ? parseInt(match[1]) : null\n",
+       "  }\n",
+       "  return null\n",
+       "}\n",
+       "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(model.generate(\"\"\"Question: Translate the following python code:\n", + "\n", + "```python\n", + "import requests\n", + "import re\n", + "\n", + "def get_datasets():\n", + " req = requests.get('https://paperswithcode.com/datasets')\n", + " if req.ok:\n", + " match = re.search(r'(\\\\d+) dataset results', req.text)\n", + " return int(match.group(1)) if match else None\n", + " return None\n", + "```\n", + "\n", + "into Javascript.\n", + "\n", + "Answer:\"\"\", max_new_tokens=150))" + ] + }, + { + "cell_type": "markdown", + "id": "628a3eb5", + "metadata": {}, + "source": [ + "> **Please note that the generations are not guaranteed to be correct.** In the example above, the model correctly translated the Python regular expression to a Javascript one, parsing an integer value from a string or even matched the common Javascript casing style (`get_datasets` to `getDatasets`). However, the `req` is not handled correctly as a `Promise`. `Request.text` is a function returning a `Promise` as well.\n", + "See Pitfalls & Failure Examples Section for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24cd0811", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Use math facts to simplify the following python code:

\n", + "
def calc_sum(n):\n",
+       "    i = 0\n",
+       "    s = 0\n",
+       "    while i <= n:\n",
+       "        s += i\n",
+       "        i += 1\n",
+       "    return s\n",
+       "
\n", + "\n", + "

<work>

\n", + "

Let's look at the body of the while loop:

\n", + "
s += i\n",
+       "i += 1\n",
+       "
\n", + "\n", + "

There are two math operations here: += and +.

\n", + "

The += operator is an assignment operation. It assigns the value of i to s.

\n", + "

The + operator is a math operation that adds i to s.

\n", + "

The next step is to figure out the order of operations.

\n", + "

Assignment operations have the same order of operations as the code that follows.

\n", + "

Math operations have the same order of operations as the order of operations in standard math.

\n", + "

The order of operations in standard math is:

\n", + "
    \n", + "
  • Exponentiation
  • \n", + "
  • Multiplication and Division
  • \n", + "
  • Addition and Subtraction
  • \n", + "
\n", + "

So, the order of operations in the code is:

\n", + "
    \n", + "
  • Exponentiation
  • \n", + "
  • Multiplication and Division
  • \n", + "
  • Addition and Subtraction
  • \n", + "
  • Assignment
  • \n", + "
\n", + "

The next step is to figure out what the code is doing:

\n", + "
s += i\n",
+       "i += 1\n",
+       "
\n", + "\n", + "

The code is assigning the value of i to s and then adding 1 to i.

\n", + "

The first line can be rewritten as s = s + i.

\n", + "

The second line can be rewritten as i = i + 1.

\n", + "

The code can be rewritten as:

\n", + "
s = s + i\n",
+       "i = i + 1\n",
+       "
\n", + "\n", + "

The next step is to figure out the value of i and s after the loop.

\n", + "

The value of i after the loop is n + 1.

\n", + "

The value of s after the loop is (n + 1) * (n + 1) / 2.

\n", + "

The code can be rewritten as:

\n", + "
s = (n + 1) * (n + 1) / 2\n",
+       "i = n + 1\n",
+       "
\n", + "\n", + "

The next step is to figure out the expression that the code is calculating:

\n", + "
s = (n + 1) * (n + 1) / 2\n",
+       "i = n + 1\n",
+       "
\n", + "\n", + "

The code is calculating the sum of the numbers from 0 to n.

\n", + "

</work>

\n", + "

Answer:

\n", + "
s = (n + 1) * (n + 1) / 2\n",
+       "i = n + 1\n",
+       "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(model.generate(\"\"\"Use math facts to simplify the following python code:\n", + "\n", + "```python\n", + "def calc_sum(n):\n", + " i = 0\n", + " s = 0\n", + " while i <= n:\n", + " s += i\n", + " i += 1\n", + " return s\n", + "```\n", + "\n", + "\"\"\", max_new_tokens=700))" + ] + }, + { + "cell_type": "markdown", + "id": "05cf15fa", + "metadata": {}, + "source": [ + "---\n", + "\n", + "We can see in this example that the initial error for the value of `s` after the loop is propagated to\n", + "the final answer. There's an off-by-one error and the correct value should be:\n", + "\n", + "```python\n", + "s = (n + 1) * n / 2\n", + "```\n", + "\n", + "Also, the model output has some incorrect statements, such as:\n", + "> The code is assigning the value of `i` to `s`" + ] + }, + { + "cell_type": "markdown", + "id": "32f10394", + "metadata": {}, + "source": [ + "### Few-Shot Prompts\n", + "\n", + "We can write a few-shot prompt to try to bias the generation into desired format:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d9dc0c3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: does \"kayak\" read the same backward as forward? Answer with code.

\n", + "

Code:

\n", + "
def is_palindrome(s):\n",
+       "    return s == s[::-1]\n",
+       "
\n", + "\n", + "

Answer: is_palindrome(\"kayak\").

\n", + "

Question: An $i$-th Peanut Butter number is given by the formula $pb_i = \\prod_{k=2}^{i} \\frac{1}{1-1/k}$. An $i$-th Jelly number is given by $J_i = \\sum_{k=2}^{i} pb_k$. What is the 6-th Jelly number? Answer with code.

\n", + "

Code:

\n", + "
def peanut_butter(i):\n",
+       "    return reduce(lambda x, y: x * y, map(lambda k: 1 / (1 - 1 / k), range(2, i + 1)))\n",
+       "\n",
+       "def jelly(i):\n",
+       "    return reduce(lambda x, y: x + y, map(lambda k: peanut_butter(k), range(2, i + 1)))\n",
+       "
\n", + "\n", + "

Answer: jelly(6).

\n", + "

Question: What is the largest prime factor of $2^{2017}-

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(model.generate(\"\"\"Question: does \"kayak\" read the same backward as forward? Answer with code.\n", + "\n", + "Code:\n", + "\n", + "```python\n", + "def is_palindrome(s):\n", + " return s == s[::-1]\n", + "```\n", + "\n", + "Answer: `is_palindrome(\"kayak\")`.\n", + "\n", + "Question: An $i$-th Peanut Butter number is given by the formula $pb_i = \\\\prod_{k=2}^{i} \\\\frac{1}{1-1/k}$. An $i$-th Jelly number is given by $J_i = \\\\sum_{k=2}^{i} pb_k$. What is the 6-th Jelly number? Answer with code.\n", + "\"\"\", max_new_tokens=150))\n" + ] + }, + { + "cell_type": "markdown", + "id": "2921bfa4", + "metadata": {}, + "source": [ + "# Text Generation & Sampling\n", + "\n", + "The `galai` library uses HuggingFace [transformers](https://huggingface.co/docs/transformers/index) to run inference, download checkpoints and efficiently load models. As a result we have an easy access to the comprehensive collection of generation algorithms. In this Section we present how to use the most common ones, supported by `galai`. Additionally we show how to fallback to using `transformers` directly to access additional options." + ] + }, + { + "cell_type": "markdown", + "id": "2d9c4acc", + "metadata": {}, + "source": [ + "### Greedy Decoding\n", + "\n", + "This is the standard algorithm used by `Model.generate`. Using the prompt and already generated tokens, the model computes a probability distribution of the next token over all tokens. The token with the highest score is appended to the generated text and the process is repeated." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5670fe5a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: what are the 10 most common text generation algorithms?

\n", + "

Answer:

\n", + "
    \n", + "
  • Beam search
  • \n", + "
  • Sampling
  • \n", + "
  • Greedy search
  • \n", + "
  • Nucleus sampling
  • \n", + "
  • Diverse beam search
  • \n", + "
  • Top-k sampling
  • \n", + "
  • Top-p sampling
  • \n", + "
  • Repetition penalty
  • \n", + "
  • Max length penalty
  • \n", + "
  • Length normalization
  • \n", + "
\n", + "

Question: what are some categories for text generation algorithms?

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_markdown(model.generate(\"Question: what are the 10 most common text generation algorithms?\\n\\nAnswer:\"))" + ] + }, + { + "cell_type": "markdown", + "id": "27cdf8dc", + "metadata": {}, + "source": [ + "### Beam Search\n", + "\n", + "In Beam Search, the model computes a probability distribution of the next token over all tokens for each of the `num_beams` generated sequences. The `num_beams` sequences with the highest probability are kept and the process is repeated." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac01597b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
def is_palindrome(word):\n",
+       "    """\n",
+       "    Check if a word is a palindrome.\n",
+       "\n",
+       "    Parameters\n",
+       "    ----------\n",
+       "    word : str\n",
+       "        The word to check.\n",
+       "\n",
+       "    Returns\n",
+       "    -------\n",
+       "    bool\n",
+       "        True if the word is a palindrome, False otherwise.\n",
+       "\n",
+       "    Examples\n",
+       "    --------\n",
+       "    >>> is_palindrome("palindrome")\n",
+       "    True\n",
+       "    >>> is_palindrome("nonpalindrome")\n",
+       "    False\n",
+       "    """\n",
+       "    return word == word[::-1]\n",
+       "\n",
+       "\n",
+       "def is_palindromic_word(word):\n",
+       "    """\n",
+       "    Check if a word is a palindromic word\n",
+       "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = \"def is_palindrome\"\n", + "# greedy search\n", + "code = model.generate(prompt, max_new_tokens=150)\n", + "display_markdown(f\"```\\n{code}\\n```\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "097833c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
def is_palindrome(word: str) -> bool:\n",
+       "    """Check if a word is a palindrome.\n",
+       "\n",
+       "    Args:\n",
+       "        word (str): The word to check.\n",
+       "\n",
+       "    Returns:\n",
+       "        bool: True if the word is a palindrome, False otherwise.\n",
+       "    """\n",
+       "    return word == word[::-1]\n",
+       "\n",
+       "\n",
+       "def is_palindrome_strict(word: str) -> bool:\n",
+       "    """Check if a word is a strict palindrome.\n",
+       "\n",
+       "    Args:\n",
+       "        word (str): The word to check.\n",
+       "\n",
+       "    Returns:\n",
+       "        bool: True if the word is a strict palindrome, False otherwise.\n",
+       "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# beam search\n", + "code = model.generate(prompt, num_beams=5, max_new_tokens=150)\n", + "display_markdown(f\"```\\n{code}\\n```\")" + ] + }, + { + "cell_type": "markdown", + "id": "d2ea8a38", + "metadata": {}, + "source": [ + "You can return up to `num_beams` sequences by specifying `num_return_sequences`." + ] + }, + { + "cell_type": "markdown", + "id": "2fb2dab1", + "metadata": {}, + "source": [ + "Beam search is slower and requires more memory compared to the Greedy Decoding. The increase in memory consumption is proportional to the number of beams used." + ] + }, + { + "cell_type": "markdown", + "id": "c6f78ad1", + "metadata": {}, + "source": [ + "### Contrastive Search\n", + "\n", + "The contrastive search ([Su et al.](https://arxiv.org/abs/2202.06417), [Su et al.](https://arxiv.org/abs/2210.14140)) algorithm is a novel generation method that aims to produce more natural texts by penalizing repetitions. We can use `transformers` implementation (see more at https://huggingface.co/blog/introducing-csearch) by specifying `penalty_alpha` and `top_k`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ccaf80e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Title: A Literature Review on Alzheimer's Disease\n", + "\n", + "# Abstract\n", + "\n", + "Alzheimer's disease (AD) is a neurodegenerative disease that affects millions of people worldwide. The number of people with AD is expected to increase as the population ages. Currently, there is no cure for AD, and the treatments available only slow the progression of the disease. This literature review aims to provide an overview of the pathophysiology of AD, the current treatments available, and the role of exercise in the management of AD.\n", + "\n", + "# 1. 
Introduction\n", + "\n", + "\n", + "Alzheimer's disease (AD) is a neurodegenerative disease that affects millions of people worldwide. The number of people with AD is expected to increase as the population ages []. Currently, there is no cure for AD, and the treatments available only slow the progression of the disease.\n", + "\n", + "# 2. Pathophysiology\n", + "\n", + "AD is characterized by the presence of amyloid plaques and neurofibrillary tangles in the brain. Amyloid plaques are formed by the accumulation of amyloid-beta (Aβ) peptides, which are produced by the cleavage of amyloid precursor protein (APP) by β-secretase and γ-secretase [[START_REF] A systemic view of Alzheimer disease – insights from amyloid-beta metabolism beyond the brain, Wang[END_REF]]. Neurofibrillary tangles are formed by the aggregation of hyperphosphorylated tau protein, which is involved in the stabilization of microtubules [[START_REF] Tau in Alzheimer disease and related tauopathies., Iqbalabrahams」aja[END_REF]].\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\n", + " \"Title: A Literature Review on Alzheimer's Disease\\n\\n# Abstract\\n\",\n", + " top_k=4, penalty_alpha=0.6, max_new_tokens=300\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "35d92dce", + "metadata": {}, + "source": [ + "---\n", + "## Sampling\n", + "\n", + "Instead of selecting tokens with the highest scores we can use the scores to model a probability distribution to sample the tokens from.\n", + "\n", + "### Nucleus Sampling\n", + "\n", + "In Nucleus sampling (see [Holtzman et al.](https://arxiv.org/abs/1904.09751)) the tokens to sample from are limited to the most likely tokens whose total probability does not exceed the `top_p` parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff2c7bc0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " # Image Denoising\n", + "\n", + "Image denoising is the process of removing noise from an image, and it is an important task in the field of image processing. Image denoising is a classic ill-posed problem, and its purpose is to reconstruct the original image from the degraded image.\n", + "\n", + "We test our method on the benchmark\n" + ] + } + ], + "source": [ + "print(model.generate(\" # Image Denoising\", top_p=0.7))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6d5f0aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " # Image Denoising\n", + "\n", + "Image denoising is a well-known inverse problem in image processing and computer vision. A lot of works have been done to tackle this problem. The key of the problem is to recover the clean image x from the noisy image y = x + v. In the past decade, there\n" + ] + } + ], + "source": [ + "print(model.generate(\" # Image Denoising\", top_p=0.7))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142951d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " # Image Denoising\n", + "\n", + "The following section is dedicated to the application of our model to the denoising of images corrupted by additive white Gaussian noise. The task of image denoising consists in removing noise from a given noisy image, where the noise is assumed to be white Gaussian with known standard deviation. In this setting, the forward\n" + ] + } + ], + "source": [ + "print(model.generate(\" # Image Denoising\", top_p=0.7))" + ] + }, + { + "cell_type": "markdown", + "id": "9a0e2c73", + "metadata": {}, + "source": [ + "---\n", + "With `top_p=1.0` all tokens are included and we get standard sampling." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d633a99d", + "metadata": {}, + "source": [ + "#### Top-K Sampling\n", + "\n", + "In top-k sampling the tokens to sample from are limited to the `top_k` most likely tokens." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bac79d01", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " # Image Denoising\n", + "\n", + "The task of image denoising is to remove the unwanted signal corruptions from the image. There is a rich body of literature [START_REF] Image restoration: total variation, wavelet frames, and beyond, Cai[END_REF][START_REF] A Review of Image Denoising Algorithms, with a New One, Buades[END_REF][START_REF]\n" + ] + } + ], + "source": [ + "print(model.generate(\" # Image Denoising\", top_k=10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e4ca7dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " # Image Denoising\n", + "\n", + "In the following section, we apply the proposed method for image denoising and compare with the recent state-of-the-art. 
The noisy image y, is modeled as\n", + "\n", + "y = x + n\n", + "where x is the original noise free image and n is the additive white\n" + ] + } + ], + "source": [ + "print(model.generate(\" # Image Denoising\", top_k=10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b52efb36", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " # Image Denoising #\n", + "###############################\n", + "## # Image Denoising #\n", + "###############################\n", + "def denoise_tv_chambolle(noisy_image,\n", + " weight_decay = 0.3,\n", + " weight_gradients = 0.\n" + ] + } + ], + "source": [ + "print(model.generate(\" # Image Denoising\", top_k=10))" + ] + }, + { + "cell_type": "markdown", + "id": "214e0ded", + "metadata": {}, + "source": [ + "---\n", + "Both `top_p` and `top_k` can be used at the same time." + ] + }, + { + "cell_type": "markdown", + "id": "1640facd", + "metadata": {}, + "source": [ + "### Using `transformers` Directly\n", + "\n", + "You can generate text with Galactica models directly using HuggingFace `transformers` library. 
One option is to use the model and tokenizer from the `galai.Model`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "643bc9d1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 50000])\n" + ] + } + ], + "source": [ + "def transformers_generate(model, prompt, new_doc=False, **options): \n", + " tokens = model._tokenize([prompt], new_doc=new_doc)\n", + " out = model.model.generate(\n", + " tokens,\n", + " **options\n", + " )\n", + " return out\n", + "\n", + "out = transformers_generate(\n", + " model,\n", + " \"In this paper, we study\",\n", + " max_new_tokens=40,\n", + " return_dict_in_generate=True,\n", + " output_scores=True\n", + ")\n", + "print(out.scores[0].shape)" + ] + }, + { + "cell_type": "markdown", + "id": "cabd8bc0", + "metadata": {}, + "source": [ + "This approach makes sure that the tokenization is done properly: the end-of-document token correctly handles padding and custom sequences are split.\n", + "\n", + "You can also use Galactica models solely using `transformers`, for example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b9a0bf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The Transformer architecture [START_REF] Attention is All you Need, Vaswani[END_REF] is a popular choice for sequence-to-sequence models\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import AutoTokenizer, OPTForCausalLM\n", + "\n", + "transformers_tokenizer = AutoTokenizer.from_pretrained(\"facebook/galactica-1.3b\")\n", + "transformers_model = OPTForCausalLM.from_pretrained(\"facebook/galactica-1.3b\", torch_dtype=torch.float16, device_map=\"auto\")\n", + "\n", + "input_text = \"The Transformer architecture [START_REF]\"\n", + "input_ids = transformers_tokenizer(input_text, return_tensors=\"pt\").input_ids.to(\"cuda\")\n", + "\n", + "outputs = 
transformers_model.generate(input_ids, max_new_tokens=20)\n", + "print(transformers_tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "2ad40d6c", + "metadata": {}, + "source": [ + "## Tokenization\n", + "\n", + "All Galactica models share the same vocabulary of 50000 tokens. The vocabulary was trained on 2% of our training corpus using Byte-Pair Encoding (BPE) tokenization.\n", + "\n", + "### Special Tokens\n", + "\n", + "Some of the tokens (f.e., the already mentioned `[START_REF]` or `<work>`) are special control tokens that can be used to steer model generation towards a specific type of content.\n", + "\n", + "\n", + "`<unk>` - reserved.\n", + "\n", + "`<s>` - reserved.\n", + "\n", + "`</s>` - end-of-document token used to split documents during training. Prepending this token to a prompt (see `new_doc` parameter in `Model.generate`) biases a model into generating a new document.\n", + "\n", + "`<pad>` - a standard padding token to align sequences in a batch.\n", + "\n", + "`[START_REF]` and `[END_REF]` - markers denoting a reference to a paper. Each paper is represented as `Title, First author name`. F.e., `[START_REF] Backpropagation Applied to Handwritten Zip Code Recognition, LeCun[END_REF]`.\n", + "\n", + "`[IMAGE]` - a placeholder for an image removed from a text.\n", + "\n", + "`<fragments>` and `</fragments>` - markers denoting fragments in the FragmentedGlass dataset.\n", + "\n", + "`<work>` and `</work>` - markers denoting step-by-step reasoning (see Step-by-Step Reasoning Section).\n", + "\n", + "`[START_SUP]`, `[END_SUP]`, `[START_SUB]` and `[END_SUB]` - markers used to protect superscript and subscript digits from NFKC normalization. Our tokenizer uses the standard NFKC rules, which means that `x²⁵` would be tokenized in the same way as `x25`. 
To prevent this, we encode `x²⁵` as `x[START_SUP]25[END_SUP]`.\n", + "\n", + "`[START_DNA]`, `[END_DNA]`, `[START_AMINO]`, `[END_AMINO]`, `[START_SMILES]`, `[END_SMILES]`, `[START_I_SMILES]` and `[END_I_SMILES]` - markers denoting special sequences, respectively: nucleic acid sequences, amino acid sequences, canonical simplified molecular-input line-entry system (SMILES) strings and isomeric SMILES strings. Besides marking a sequence of a given type, these tokens force a special tokenization mode in which each character is represented as a single token. F.e., `GATTACA` is tokenized as `G|ATT|ACA`, while `[START_DNA]GATTACA[END_DNA]` is tokenized as `[START_DNA]|G|A|T|T|A|C|A|[END_DNA]`. Note that for this to work you need to transform your prompt with `galai.utils.escape_custom_split_sequence`. All standard text generation functions of `galai.model.Model` do this automatically.\n", + "\n", + "The `galai` library takes care of handling the special tokens. If you are using `tokenizers` directly then most likely you want to keep the special tokens in the output for further processing. Set `skip_special_tokens=False` in `tokenizers.Tokenizer.decode`.\n", + "\n", + "### Decoupling of Tokens\n", + "\n", + "The BPE training algorithm creates a vocabulary based on the frequencies of subwords in the training corpus, with more frequent subwords being represented with fewer tokens. This means that visually similar subwords may end up having totally different token representations. For example, in the GPT-2 tokenizer (trained before year 2020) each of the numbers `{2000, 2001, ..., 2020}` is encoded with a unique token, and all of the numbers `{2021, 2022, ..., 2030}` are represented as two tokens: `20|21`, `20|22`, etc. When training on a corpus with math, TeX formulas and source code, it can happen that a single token encodes multiple independent functions. 
F.e., `\\(-` can end up being a single token making prompting more difficult and the model less robust to changes in spaces.\n", + "\n", + "To prevent this issue we implemented custom splitting rules, presented in the example below. For performance reasons we keep a leading space (i.e., ` text` can be a single token)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02a6536f", + "metadata": {}, + "outputs": [], + "source": [ + "from galai.utils import escape_custom_split_sequence\n", + "from IPython.display import HTML\n", + "import html\n", + "\n", + "def tokenization_example(tokenizer, text):\n", + " text = escape_custom_split_sequence(text)\n", + " tokens = [tokenizer.decode([x], skip_special_tokens=False) for x in tokenizer.encode(text).ids]\n", + " spans = \"\".join([html.escape(t).replace(\" \", \"▁\").replace(\"\\n\", \"\\\\n\") for t in tokens])\n", + " style = \"\"\n", + " return HTML(style + \"
\" + spans + \"
\")\n", + "\n", + "tokenization_example(model.tokenizer, r\"\"\"Tokenization of most of the natural texts is not impacted by the rules.\n", + "However, most of the non-alphanumeric ASCII characters are split. This is mostly visible in TeX formulas,\n", + "for example: $\\frac{d}{dx}\\,\\cos(x) = -\\sin(x)$, \\(\\zeta(s)=\\sum_{n=1}^{\\infty} n^{-s}\\). \n", + "It also impacts source codes, like: x+=((1,2)); \n", + "As a side-effect, contractions (I'll, you've, it's, etc.) and emoticons (like this Santa Claus *<|:‑) ) are split. \n", + "This rule makes exception for a repeated sequence of the same character, so f.e., ---------------- is still a single token. \n", + "Additionally, EOL character is always split, so that \n", + "\n", + "\n", + "\n", + "\n", + "are 5 tokens. \n", + "Numbers are slit into individual digits as before, f.e., $$\\pi=3.14159265\\ldots$$ \n", + "Note that non-alphanumeric splitting splits space in front as well (f.e., i ++, x <-> y, if ( x <= y )). \n", + "Special tokens like [START_REF], or [IMAGE] are left intact. \n", + "The tokenizer additionally supports custom sequence splitting (does not work by default, requires a custom preprocessing step), f.e.: \n", + "[START_DNA]GATTACA[END_DNA], [START_AMINO]PEPTIDES[END_AMINO], \n", + "[START_SMILES]CC(=O)NCCC1=CNc2c1cc(OC)cc2[END_SMILES] and [START_I_SMILES]CN1CCC[C@H]1c2cccnc2[END_I_SMILES]\"\"\")\n" + ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "233b44bf", + "metadata": {}, + "source": [ + "![image.png](attachment:image.png)" + ] + }, + { + "cell_type": "markdown", + "id": "03a6526a", + "metadata": {}, + "source": [ + "## Model Selection\n", + "\n", + "There are 5 models in total, ranging in size from 125 million parameter up to 121 billion parameters. The model architecture is practically the same as the architecture of OPT models (see [Zhang et al.](https://arxiv.org/abs/2205.01068))." 
+ ] + }, + { + "cell_type": "markdown", + "id": "2a2255a0", + "metadata": {}, + "source": [ + "## Working with Large Models\n", + "\n", + "### Loading a model\n", + "\n", + "There are 5 galactica models to choose from, ranging in size from 125 million to 121 billion parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ed65f9b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
NameParametersLayersHeadsHead SizeVocabulary SizeContext Size
mini125.0 M121264500002048
base1.3 B243264500002048
standard6.7 B3232128500002048
large30.0 B4856128500002048
huge121.3 B9680128500002048
" + ], + "text/plain": [ + "[ModelInfo(name='mini', num_layers=12, num_heads=12, head_size=64, vocab_size=50000, max_positions=2048),\n", + " ModelInfo(name='base', num_layers=24, num_heads=32, head_size=64, vocab_size=50000, max_positions=2048),\n", + " ModelInfo(name='standard', num_layers=32, num_heads=32, head_size=128, vocab_size=50000, max_positions=2048),\n", + " ModelInfo(name='large', num_layers=48, num_heads=56, head_size=128, vocab_size=50000, max_positions=2048),\n", + " ModelInfo(name='huge', num_layers=96, num_heads=80, head_size=128, vocab_size=50000, max_positions=2048)]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from galai.utils import ModelInfo\n", + "ModelInfo.all()" + ] + }, + { + "cell_type": "markdown", + "id": "fa1c3ff5", + "metadata": {}, + "source": [ + "To load a model use the `load_model()` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f68177b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on function load_model in module galai:\n", + "\n", + "load_model(name: str, dtype: Union[str, torch.dtype] = None, num_gpus: int = None, parallelize: bool = False)\n", + " Utility function for loading the model\n", + " \n", + " Parameters\n", + " ----------\n", + " name: str\n", + " Name of the model\n", + " \n", + " dtype: str\n", + " Optional dtype; default float32 for all models but 'huge'\n", + " \n", + " num_gpus : int (optional)\n", + " Number of GPUs to use for the inference. If None, all available GPUs are used. If 0 (or if\n", + " None and there are no GPUs) only a CPU is used. If a positive number n, then the first n CUDA\n", + " devices are used.\n", + " \n", + " parallelize : bool; default False\n", + " Specify if to use model tensor parallelizm. 
Ignored in CPU or single GPU inference.\n", + " \n", + " By the default (when parallelize is False) the multi-GPU inference is run using accelerate's\n", + " pipeline parallelizm in which each GPU is responsible for evaluating a given subset of\n", + " model's layers. In this mode evaluations are run sequentially. This mode is well suited for\n", + " developing in model's internals as it is more robust in terms of recovering from exceptions\n", + " due to not using additional processes. However, because of the sequential nature of\n", + " pipeline parallelizm, at any given time only a single GPU is working.\n", + " \n", + " If parallelize is True, parallelformers' model tensor parallelizm is used instead.\n", + " \n", + " Returns\n", + " ----------\n", + " Model - model object\n", + "\n" + ] + } + ], + "source": [ + "help(gal.load_model)" + ] + }, + { + "cell_type": "markdown", + "id": "90efcf2a", + "metadata": {}, + "source": [ + "#### CPU Inference\n", + "\n", + "The default call to `load_model` uses all available CUDA devices. If no device is found the model is loaded to RAM instead. Set `num_gpus=0` to force CPU inference even if CUDA-capable devices are present.\n", + "\n", + "#### MPS (Metal Performance Shaders) Inference\n", + "\n", + "To run the model on macOS on Apple GPUs simply call `model.model.to(\"mps\")` right after loading the model." + ] + }, + { + "attachments": { + "parallel.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "ac9a3a02", + "metadata": {}, + "source": [ + "### Multi-GPU Inference\n", + "\n", + "We support two types of model parallelism to enable multi-GPU inference: pipeline parallelism (using [accelerate](https://huggingface.co/docs/accelerate/)) and tensor parallelism (using [parallelformers](https://tunib-ai.github.io/parallelformers/)). 
A greatly simplified comparison of the two modes is depicted below:\n", + "\n", + "![parallel.png](attachment:parallel.png)\n", + "\n", + "In the pipeline parallel mode (`gal.load_model(..., parallelize=False)`, the default) the model weights are split by layers and the input is processed sequentially. This simplifies the synchronization operations required to run the inference. As a result, in this mode it's easier to recover from internal model exceptions (like CUDA OOM), inspect model weights or access internal states. However, because the input is being processed sequentially, at any given time only a single GPU is working.\n", + "\n", + "To speed up the inference you can load a model with tensor parallelism enabled (`gal.load_model(..., parallelize=True)`). In this mode the input is split into parts that are processed in parallel. Underneath we use the `parallelformers` library that slices transformer-based decoder modules into [Megatron-LM](https://arxiv.org/pdf/1909.08053.pdf) tensor parallel modules. To process the input in parallel, `parallelformers` spawns one additional process for each GPU. As a side effect of this approach, state changes (such as `torch.no_grad()` or `torch.manual_seed()`) triggered from the main process are not visible inside those processes, unless they are manually propagated.\n", + "\n", + "In general, `accelerate` and `parallelformers` have different characteristics in terms of memory usage, communication overhead, inference speed and ease of use (in case of modifying a model's internals), so it's best to compare the two in your particular environment. 
Below we compare the inference time of the `huge` model in half precision on 8 A100 GPUs (40GB VRAM, PCIe), an average of 5 runs after a single warm up run:\n", + "\n", + "| Call | Batch Size | Prompt length | Generated Tokens | Time (accelerate) | Time (parallelformers) |\n", + "|:-----------|-----------:|--------------:|-----------------:|------------------:|-----------------------:|\n", + "| generate() | 4 | 100 | 200 | 48 s | 18 s |\n", + "\n", + "### Disk Space Requirements\n", + "\n", + "All of the checkpoint files use float16 weights, so the on-disk file size in bytes is around two times the number of parameters. F.e., the `standard` 6.7B model requires around 13.7 GB of disk space. You can specify a different download location by setting the `TRANSFORMERS_CACHE` environment variable accordingly. Make sure to set the variable before importing the `transformers` module (including indirect import through `galai`).\n", + "\n", + "### Memory Requirements\n", + "\n", + "The memory requirement of the models depends on the inference mode. Loading the model in float16 requires two bytes per parameter. That means that f.e., the `large` 30B model requires around 60 GB of memory. Using the full float32 precision doubles the required memory.\n", + "\n", + "Besides the model weights one has to include the memory required to store intermediate activations and cached outputs. 
The cache size can be computed using `ModelInfo`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33ca4635", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9.4 GB\n" + ] + } + ], + "source": [ + "batch_size = 8\n", + "longest_prompt_length = 100\n", + "max_new_tokens = 200\n", + "cache_size = ModelInfo.by_name(\"huge\").memory_per_token(dtype=\"float16\") * batch_size * (longest_prompt_length + max_new_tokens)\n", + "print(f\"{cache_size / 1e9:.1f} GB\")" + ] + }, + { + "cell_type": "markdown", + "id": "c1467b01", + "metadata": {}, + "source": [ + "## Non-deterministic Generation\n", + "\n", + "While the outputs presented above are quite robust you might notice some differences depending on the exact environment you are using to run the inference. Additionally, even using the exact same environment the outputs might change due to multiple sources of non-determinism in the generation process. Apart from the cases in which non-determinism is by design (i.e., sampling outputs with top_p or top_k) or the standard pytorch and CUDA non-determinism (see https://pytorch.org/docs/stable/notes/randomness.html), there are various reasons for the outputs to be different between environments or between runs on the same environment. Due to an accumulation of numeric errors, the differences are more likely to occur for bigger models and longer sequences. Below is a list of common sources of non-determinism:\n", + "\n", + "* different dtype used for inference: `float32` vs `float16` vs `bfloat16`.\n", + "* different transformers version. We recommend using `transformers >= 4.24` to take advantage of stability improvements in the OPT model implementation.\n", + "* different pytorch version. 
We recommend using `torch >= 1.12` to take advantage of a more stable implementation of LayerNorm.\n", + "* different input shape: batch size and padding.\n", + "* different parallelism mode: pipeline parallel vs tensor parallel. Additionally, as noted in the Multi-GPU Inference Section, manually setting seed values does not work out of the box with parallelformers.\n", + "* running inference in the training mode. The model architecture includes dropout regularization in several places, which is turned off in the evaluation mode.\n", + "* differences in prompts: while larger models should be more robust to subtle changes in a prompt, a slightly different input (f.e., two spaces instead of one, an additional new line, using `\\$` `\\$` LaTeX delimiters instead of `\\(` `\\)` or no delimiters at all) might result in a totally different output." + ] + }, + { + "cell_type": "markdown", + "id": "462c4f3d", + "metadata": {}, + "source": [ + "# Pitfalls & Failure Examples\n", + "\n", + "While Galactica language models enable one to analyze and work with scientific data in multiple new ways, it's important to understand the shortcomings of the models. We present here examples of cases in which the models don't work as expected. This section is by no means exhaustive.\n", + "\n", + "## Hallucinations\n", + "\n", + "The language models are trained with an objective of predicting the next token based on the previous tokens. As a result, the text generated by the models may be non-factual or simply made up:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ff5326c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Ignacy Jan Paderewski\n", + "\n", + " Ignacy Jan Paderewski (Polish: [iɡˈnatsɨ ˈjan padɛˈrɛfskji]; 12 March 1860 – 13 March 1941) was a Polish pianist, composer, and statesman. He was a leading figure in the international music world of the late 19th and early 20th centuries. 
He was a virtuoso pianist, composer, and conductor, and a political activist who served as the Prime Minister of\n" + ] + } + ], + "source": [ + "print(model.generate(\"# Ignacy Jan Paderewski\\n\", max_new_tokens=120))" + ] + }, + { + "cell_type": "markdown", + "id": "14fbd703", + "metadata": {}, + "source": [ + "Compare the output with [the wikipedia entry](https://en.wikipedia.org/wiki/Ignacy_Jan_Paderewski):\n", + "\n", + "> Ignacy Jan Paderewski (Polish: [iɡˈnatsɨ ˈjan padɛˈrɛfskʲi]; 18 November [O.S. 6 November] 1860 – 29 June 1941) was a Polish pianist and composer who became a spokesman for Polish independence. In 1919, he was the new nation's Prime Minister and foreign minister during which he signed the Treaty of Versailles, which ended World War I." + ] + }, + { + "cell_type": "markdown", + "id": "c9480d88", + "metadata": {}, + "source": [ + "The issue is especially visible in case of prompts with incorrect assumptions, in which a prompt already includes made up statements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b79f2830", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "The Einstein-Presley-Lewandowski equation is given by:\n", + "\n", + "$$ \\displaystyle\\frac{\\partial\\rho}{\\partial t}=-\\frac{i}{\\hbar}[H,\\rho]+\\frac{%\n", + "\\gamma}{2}\\left(2a\\rho a^{\\dagger}-a^{\\dagger}a\\rho-\\rho a^{\\dagger}a\\right) $$\n", + "$$ \\displaystyle+\\frac{\\gamma(\\bar{n}+1)}{2}\\left(2a^{\\dagger}\\rho a-aa^{\\dagger}%\n", + "\\rho-\\rho aa^{\\dagger}\\right) $$\n", + "$$ \\displaystyle+\\frac{\\gamma\\bar{n}}{2}\\left(2a\\rho a^{\\dagger}-a^{\\dagger}a\\rho%\n", + "-\\rho" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display_latex(\n", + " model.generate(\n", + " \"The Einstein-Presley-Lewandowski equation is given by:\\n\",\n", + " max_new_tokens=200,\n", + " new_doc=True\n", + " )\n", + ")" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "id": "a44b9fed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: what was the main reason that lead to the duel between Richard Feynman and Jadwiga of Poland?\n", + "\n", + "Answer: Feynman's refusal to accept the validity of the Pauli exclusion principle\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\n", + " \"Question: what was the main reason that lead to the duel between Richard Feynman and Jadwiga of Poland?\\n\\nAnswer:\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8be1d402", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: what is the largest prime number?\n", + "\n", + "Answer: 1000000007\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\n", + " \"Question: what is the largest prime number?\\n\\nAnswer:\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cccce667", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: is there the largest prime number?\n", + "\n", + "Answer: No\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\n", + " \"Question: is there the largest prime number?\\n\\nAnswer:\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ff995dc7", + "metadata": {}, + "source": [ + "---\n", + "The Galactica models are not multi-lingual by design. Most of the natural language documents in the NatureBook corpus are written in **English**. Prompting in different language results in more random generations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d38f3692", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " # Galaxia\n", + "Una galaxia es un conjunto de estrellas, galaxias, sistemas planetarios, etc. que se encuentran en una determinada region del universo.\n", + "\n", + "Galaxia es una herramienta que permite generar simulaciones de galaxias en un determinado momento del universo.\n", + "\n", + "\n" + ] + } + ], + "source": [ + "# Spanish prompt\n", + "print(model.generate(\" # Galaxia\\nUna galaxia es un conjunto de estrellas,\", new_doc=True, max_new_tokens=65))" + ] + }, + { + "cell_type": "markdown", + "id": "0d733c31", + "metadata": {}, + "source": [ + "---\n", + "A translation by a native speaker:\n", + "> A galaxy is a group of stars, galaxies, planetary systems, etc. that are located in a specific region of the universe.\n", + "Galaxy is a tool to generate galaxy simulations at a specific time of the Universe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca8507bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: how do you say 'Good morning' in French?\n", + "\n", + "Answer: Bonjour\n" + ] + } + ], + "source": [ + "print(model.generate(\"Question: how do you say 'Good morning' in French?\\n\\nAnswer:\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a60710e6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: how do you say 'Good morning' in Polish?\n", + "\n", + "Answer: Dziękuję!\n", + "\n", + "Answer: Dziękuję!\n", + "\n", + "Answer: Dziękuję!\n", + "\n", + "Answer: Dziękuję!\n", + "\n", + "Answer: Dziękuję!\n", + "\n", + "Answer: Dziękuj\n" + ] + } + ], + "source": [ + "print(model.generate(\"Question: how do you say 'Good morning' in Polish?\\n\\nAnswer:\"))" + ] + }, + { + "cell_type": "markdown", + "id": "12985b70", + "metadata": {}, + "source": [ + "---\n", + "Translation of the answer:\n", + "> Thank you!" + ] + }, + { + "cell_type": "markdown", + "id": "4df5d75d", + "metadata": {}, + "source": [ + "The NatureBook corpus was assembled in July 2022, so the models have no information about anything that happened after." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4d11f68", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Elizabeth II\n", + "\n", + " Elizabeth II (Elizabeth Alexandra Mary Windsor; born 21 April 1926) is Queen of the United Kingdom and the other Commonwealth realms, including Canada, Australia, New Zealand, Jamaica, Barbados, and 15 other Commonwealth countries\n" + ] + } + ], + "source": [ + "print(model.generate(\"# Elizabeth II\\n\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8f07507", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: What year is it?\n", + "\n", + "Answer: 1997\n" + ] + } + ], + "source": [ + "print(model.generate(\"Question: What year is it?\\n\\nAnswer:\", new_doc=True))" + ] + }, + { + "cell_type": "markdown", + "id": "9004276f", + "metadata": {}, + "source": [ + "## Prompt Robustness\n", + "\n", + "The model output may depend on seemingly insignificant variations in prompts, especially in the case of the smaller models. This section presents examples in which a small change to the prompt results in a different output." + ] + }, + { + "cell_type": "markdown", + "id": "5615092d", + "metadata": {}, + "source": [ + "### Spelling Errors\n", + "\n", + "A large part of the NatureBook corpus consists of documents using a formal and technical language. The model output may change depending on spelling, punctuation and grammatical errors in a prompt."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a2a038", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: Write a python function that checks if an input string is a palindrome.\n", + "\n", + "Answer:\n", + "\n", + "```\n", + "def is_palindrome(s):\n", + " return s == s[::-1]\n", + "```\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\n", + " \"Question: Write a python function that checks if an input string is a palindrome.\\n\\nAnswer:\",\n", + " max_new_tokens=30\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc26491d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: Write python function that check if input string is palindrome.\n", + "\n", + "Answer: def is_palindrome(s):\n", + " if len(s) < 2:\n", + " return True\n", + " return s[0] == s[-1] and is_palindrome(s[1:-1])\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\n", + " \"Question: Write python function that check if input string is palindrome.\\n\\nAnswer:\",\n", + " max_new_tokens=60\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1181d9c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Ignacy Jan Paderewski\n", + "\n", + " Ignacy Jan Paderewski (Polish: [iɡˈnatsɨ ˈjan padɛˈrɛfskji]; 12 March 1860 – 13 March 1941) was a Polish p\n" + ] + } + ], + "source": [ + "print(model.generate(\"# Ignacy Jan Paderewski\\n\")) # correct name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ad1649f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Ignacy Jan Paderweski\n", + "\n", + " Ignacy Jan Paderweski (1770–1830) was a Polish painter, a representative of the Polish 
school of painting.\n", + "\n", + "## Biography\n", + "\n", + " He was born in Warsaw, the son of a painter, Franciszek Paderwski. He\n" + ] + } + ], + "source": [ + "print(model.generate(\"# Ignacy Jan Paderweski\\n\")) # a typo in the surname" + ] + }, + { + "cell_type": "markdown", + "id": "ad2aad24", + "metadata": {}, + "source": [ + "### Whitespace Encoding\n", + "\n", + "All of the documents in the NatureBook corpus use `\\n` as the end-of-line character." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee003e54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: Write a python function that checks if an input string is a palindrome.\r\n", + "\r\n", + "Answer:\n", + "\n", + "def is_palindrome(s):\n", + " if len(s) == 0:\n", + " return True\n", + " if s[0] != s[-1]:\n", + " return False\n", + " return is_palindrome(s[1:])\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\n", + " \"Question: Write a python function that checks if an input string is a palindrome.\\r\\n\\r\\nAnswer:\",\n", + " max_new_tokens=65\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "71b6fbaf", + "metadata": {}, + "source": [ + "Interestingly using a Beam Search helps to produce a working code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0b8ab61", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: Write a python function that checks if an input string is a palindrome.\r\n", + "\r\n", + "Answer: ```python\n", + "def is_palindrome(s):\n", + " if len(s) == 0:\n", + " return True\n", + " else:\n", + " return s[0] == s[-1] and is_palindrome(s[1:-1])\n", + "```\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\n", + " \"Question: Write a python function that checks if an input string is a palindrome.\\r\\n\\r\\nAnswer:\",\n", + " max_new_tokens=65, 
num_beams=5\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58ad6f2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: Write a python function that checks if an input string is a palindrome.\n", + "\n", + "Answer: def is_palindrome(s):\n", + " if len(s) < 2:\n", + " return True\n", + " return s[0] == s[-1] and is_palindrome(s[1:-1])\n" + ] + } + ], + "source": [ + "# multiple spaces between words\n", + "print(\n", + " model.generate(\n", + " \"Question: Write a python function that checks if an input string is a palindrome.\\n\\nAnswer:\",\n", + " max_new_tokens=55\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "80d5e160", + "metadata": {}, + "source": [ + "---\n", + "The solution is correct, just different from the one obtained using prompt with normalized spaces.\n", + "\n", + "It's a good practice not to include a trailing space in the prompt." + ] + }, + { + "cell_type": "markdown", + "id": "16b910b0", + "metadata": {}, + "source": [ + "### TeX formula markers\n", + "\n", + "Most of the documents in the NatureBook corpus use `\\(` and `\\)` for inline TeX formulas and `\\[` and `\\]` for display mode maths, but some of the data sources use `$` and `$$` instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13d9a6e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: What is the expected value of a random variable uniformly distributed over the interval $[a^2, b+c]$?

\n", + "

Answer: $\\frac{b+c+a^2}{2}$

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# using \\( \\)\n", + "display_markdown(\n", + " model.generate(\n", + "\"\"\"Question: What is the expected value of a random variable uniformly distributed over the interval \\\\([a^2, b+c]\\\\)?\n", + "\n", + "Answer:\"\"\", max_new_tokens=20)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a31bff7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: What is the expected value of a random variable uniformly distributed over the interval $$[a^2, b+c]$$?

\n", + "

Answer: $\\frac{a^2+b+c}{2}$

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# using \\[ \\]\n", + "display_markdown(\n", + " model.generate(\n", + "\"\"\"Question: What is the expected value of a random variable uniformly distributed over the interval \\\\[[a^2, b+c]\\\\]?\n", + "\n", + "Answer:\"\"\", max_new_tokens=20)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c14d9ae5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: What is the expected value of a random variable uniformly distributed over the interval $[a^2, b+c]$?

\n", + "

Answer: $\\begin{aligned} \\operatorname{E}(X) &= \\int_{x=a^2}^{b+c} x \\cdot \\dfrac{1}{b+c-a^2} \\, dx \\\\ &= \\dfrac{1}{b+c-a^2} \\cdot \\left[\\dfrac{x^2}{2} \\right]_{x=a^2}^{b+c} \\\\ &= \\dfrac{1}{b+c-a^2} \\cdot \\left(\\dfrac{(b+c)^2}{2} - \\dfrac{(a^2)^2}{2} \\right) \\\\ &= \\dfrac{1}{b+c-a^2} \\cdot \\left(\\dfrac{b^2+2bc+c^2}{2} - \\dfrac{a^4}{2} \\right) \\\\ &= \\dfrac{1}{b+c-a^2} \\cdot \\left(\\dfrac{b^2+c^2}{2} + bc - \\dfrac{a^4}{2} \\right) \\end{aligned}$

\n", + "

In conclusion, the expected value of $X$ is $\\dfrac{1}{b+c-a^2} \\cdot \\left(\\dfrac{b^2+c^2}{2} + bc - \\dfrac{a^4}{2} \\right)$.

\n", + "

</work>

\n", + "

Ans: the expected value of $X$ is $\\dfrac{1}{b+c-a^2} \\cdot \\left(\\dfrac{b^2+c^2}{2} + bc - \\dfrac{a^4}{2} \\right)$.

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# using $\n", + "display_markdown(\n", + " model.generate(\n", + "\"\"\"Question: What is the expected value of a random variable uniformly distributed over the interval $[a^2, b+c]$?\n", + "\n", + "Answer:\"\"\", max_new_tokens=500)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3092c2cc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: What is the expected value of a random variable uniformly distributed over the interval $$[a^2, b+c]$$?

\n", + "

Answer:

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# using $$\n", + "display_markdown(\n", + " model.generate(\n", + "\"\"\"Question: What is the expected value of a random variable uniformly distributed over the interval $$[a^2, b+c]$$?\n", + "\n", + "Answer:\"\"\", max_new_tokens=40)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "141bf95d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: What is the expected value of a random variable uniformly distributed over the interval [a^2, b+c]?

\n", + "

Answer: $\\frac{b+c+a^{2}}{2}$

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# plaintext math\n", + "display_markdown(\n", + " model.generate(\n", + "\"\"\"Question: What is the expected value of a random variable uniformly distributed over the interval [a^2, b+c]?\n", + "\n", + "Answer:\"\"\", max_new_tokens=22)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64e17893", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Question: What is the expected value of a random variable uniformly distributed over the interval $[a^2, b+c]$?

\n", + "

Answer: $\\dfrac{a^2+b+c}{2}$

\n", + "

</work>

\n", + "

Answer: $\\dfrac{a^2+b+c}{2}$

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# using $, beam search\n", + "display_markdown(\n", + " model.generate(\n", + "\"\"\"Question: What is the expected value of a random variable uniformly distributed over the interval $[a^2, b+c]$?\n", + "\n", + "Answer:\"\"\", max_new_tokens=50, num_beams=5)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ca469fb3", + "metadata": {}, + "source": [ + "Please note that `display_latex` and `display_markdown` convert the markers to `$` and `$$` only for the display purposes." + ] + }, + { + "cell_type": "markdown", + "id": "f08f6e3e", + "metadata": {}, + "source": [ + "### Letter-case" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c03be57", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: what is Alzheimer's Disease?\n", + "\n", + "Answer: Alzheimer's disease (AD) is a neurodegenerative disorder that is characterized by progressive cognitive decline and memory loss. The disease is the most common cause of dementia in the elderly. The neuropathological hallmarks of AD are the presence of extracellular amyloid plaques and intracellular neurofibrillary tangles (\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\"Question: what is Alzheimer's Disease?\\n\\n\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f34a25e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: what is alzheimer's disease?\n", + "\n", + "Answer: Alzheimer's disease (AD) is a neurodegenerative disorder characterized by progressive cognitive decline and memory loss. The neuropathological hallmarks of AD are the presence of extracellular amyloid plaques and intracellular neurofibrillary tangles (NFTs). 
Amyloid plaques are composed of amyloid-β (Aβ\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\"Question: what is alzheimer's disease?\\n\\n\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f71ced5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question: what is ALZHEIMER'S DISEASE?\n", + "\n", + "Answer: Alzheimer's disease (AD) is a progressive neurodegenerative disorder that is the most common cause of dementia in the elderly. It is characterized by the presence of two types of protein deposits in the brain: extracellular amyloid plaques and intracellular neurofibrillary tangles. The amyloid plaques are composed of\n" + ] + } + ], + "source": [ + "print(\n", + " model.generate(\"Question: what is ALZHEIMER'S DISEASE?\\n\\n\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8d6258b4", + "metadata": {}, + "source": [ + "### New document mode\n", + "\n", + "To make the training efficient, all the documents in the training corpus were concatenated into a single sequence of tokens, with `
</s>` token as a document boundary marker.\n", + "\n", + "If you want autocomplete based functionality, it is often good to experiment with turning off the new document mode by setting `new_doc=False` in calls to `generate`. This puts the generation into continuation mode, which means that the prompt may be in the middle of a document, as opposed to the beginning." + ] + }, + { + "cell_type": "markdown", + "id": "a307b07b", + "metadata": {}, + "source": [ + "## Other Limitations" + ] + }, + { + "cell_type": "markdown", + "id": "b2f99ffb", + "metadata": {}, + "source": [ + "### [START_REF] position\n", + "\n", + "As described above we use the `[START_REF]` token to generate in-context references. The token is a form of \"insert citation here\" instruction to the model. This means that the word order may impact what paper is generated." + ] + }, + { + "cell_type": "markdown", + "id": "bc12a702", + "metadata": {}, + "source": [ + "### Correct Answer with Incorrect reasoning\n", + "\n", + "In the example below we use the question answering template to get a solution for a probability question:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67a91956", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Prompt: Question: We flip a fair coin $3$ times. What is the probability of getting an even number of heads?

\n", + "

Answer:

\n", + "

Answer: Question: We flip a fair coin $3$ times. What is the probability of getting an even number of heads?

\n", + "

Answer: $\\dfrac{1}{2}$

\n", + "

</work>

\n", + "

Ans: $\\dfrac{1}{2}$

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = f\"Question: We flip a fair coin $3$ times. What is the probability of getting an even number of heads?\\n\\nAnswer:\"\n", + "answer = model.generate(prompt, new_doc=True)\n", + "display_markdown(f\"**Prompt**: {prompt}\\n\\n**Answer**: {answer}\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "606aef41", + "metadata": {}, + "source": [ + "The model provides a correct answer. Trying to get the reasoning with the `<work>` token, we get a different answer:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f5f16e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Prompt: Question: We flip a fair coin $3$ times. What is the probability of getting an even number of heads?

\n", + "

<work>

\n", + "

Answer: Question: We flip a fair coin $3$ times. What is the probability of getting an even number of heads?

\n", + "

<work>

\n", + "

Flipping a fair coin $3$ times has the same probability as picking a card from a standard deck of $52$ cards and then randomly putting it back.

\n", + "

There are $2$ cards that have an even number of heads: $\\text{HHH}$ and $\\text{TTT}$.

\n", + "

There are $2^3=8$ total ways to flip a coin $3$ times.

\n", + "

The probability of getting an even number of heads is $\\dfrac{2}{8}=\\dfrac{1}{4}$.

\n", + "

</work>

\n", + "

Ans: The probability of getting an even number of heads is

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = f\"Question: We flip a fair coin $3$ times. What is the probability of getting an even number of heads?\\n\\n\"\n", + "answer = model.generate(prompt, new_doc=True, max_new_tokens=150)\n", + "display_markdown(f\"**Prompt**: {prompt}\\n\\n**Answer**: {answer}\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "b51fa923", + "metadata": {}, + "source": [ + "Sometimes a prompt might be \"too robust\". Let's reconsider the dot-product attention example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2d26f3d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

The paper that presented a novel computing block given by the formula:\n", + "$$\n", + "f(Q, K, V) = \\textrm{softmax}\\left(\\frac{QK^T}{\\sqrt{d_k}}\\right)V\n", + "$$

\n", + "

Reference: Attention is All you Need, Vaswani

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt1 = \"\"\"The paper that presented a novel computing block given by the formula:\n", + "\\\\[\n", + "f(Q, K, V) = \\\\textrm{softmax}\\\\left(\\\\frac{QK^T}{\\\\sqrt{d_k}}\\\\right)V\n", + "\\\\]\n", + "\n", + "\"\"\"\n", + "ref = model.generate_reference(prompt1)\n", + "display_markdown(f\"{prompt1}\\n**Reference**: {ref}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a5e137c4", + "metadata": {}, + "source": [ + "How much can we change the formula and still get that reference? Because of the hallucination issue described above, the formula can be changed a lot:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3308cfe0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

The paper that presented a novel computing block given by the formula:\n", + "$$\n", + "f(X) = \\cos\\left(\\frac{X}{d_k}\\right)\n", + "$$

\n", + "

Reference: Attention is All you Need, Vaswani

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt2 = \"\"\"The paper that presented a novel computing block given by the formula:\n", + "\\\\[\n", + "f(X) = \\\\cos\\\\left(\\\\frac{X}{d_k}\\\\right)\n", + "\\\\]\n", + "\n", + "\"\"\"\n", + "ref = model.generate_reference(prompt2)\n", + "display_markdown(f\"{prompt2}\\n**Reference**: {ref}\")" + ] + }, + { + "cell_type": "markdown", + "id": "5040f8a1", + "metadata": {}, + "source": [ + "In the example above the model is forced into reference generation with a false premise that such a paper introducing the formula exists. One option to mitigate the issue is to provide a few-shot prompt with examples in which the answer can be \"no such paper\". Another option is to set up a generation threshold on model's logits:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4dca6877", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "def score_generation(model, prompt):\n", + " tokens = model._tokenize([prompt], new_doc=False)\n", + " out = model.model.generate(\n", + " tokens,\n", + " max_new_tokens=40,\n", + " return_dict_in_generate=True,\n", + " output_scores=True\n", + " )\n", + " generation = out.sequences[0, len(tokens[0]):].tolist()\n", + " scores = []\n", + " end_ref_id = model.tokenizer.token_to_id(\"[END_REF]\")\n", + " for pos, token_id in enumerate(generation):\n", + " log_probs = torch.nn.functional.log_softmax(out.scores[pos], dim=-1)\n", + " scores.append(log_probs[0, token_id].item())\n", + " if token_id == end_ref_id:\n", + " break\n", + " text = model.tokenizer.decode(generation[:pos], skip_special_tokens=False)\n", + " scores = torch.tensor(scores)\n", + " return display_markdown(f\"\"\"**Prompt**: {prompt}\n", + "\n", + "**Predicted reference**: {text}\n", + "\n", + "**Min score**: {scores.min().item():.2f}\n", + "\n", + "**Sum score**: 
{scores.sum().item():.2f}\n", + "\n", + "\"\"\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56d44aaa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Prompt: The paper that presented a novel computing block given by the formula:\n", + "$$\n", + "f(Q, K, V) = \\textrm{softmax}\\left(\\frac{QK^T}{\\sqrt{d_k}}\\right)V\n", + "$$

\n", + "

[START_REF]

\n", + "

Predicted reference: Attention is All you Need, Vaswani

\n", + "

Min score: -0.29

\n", + "

Sum score: -0.35

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score_generation(model, prompt1 + \"[START_REF]\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2d8ec13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Prompt: The paper that presented a novel computing block given by the formula:\n", + "$$\n", + "f(X) = \\cos\\left(\\frac{X}{d_k}\\right)\n", + "$$

\n", + "

[START_REF]

\n", + "

Predicted reference: Attention is All you Need, Vaswani

\n", + "

Min score: -1.45

\n", + "

Sum score: -1.52

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score_generation(model, prompt2 + \"[START_REF]\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a58d329", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Prompt: The $E=m c^2$ paper [START_REF]

\n", + "

Predicted reference: Ist die Trägheit eines Körpers von seinem Energieinhalt abhängig, Einstein

\n", + "

Min score: -0.42

\n", + "

Sum score: -0.58

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score_generation(model, \"The $E=m c^2$ paper [START_REF]\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3336be1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Prompt: The $E=m c^3$ paper [START_REF]

\n", + "

Predicted reference: Ist die Trägheit eines Körpers von seinem Energieinhalt abhängig, Einstein

\n", + "

Min score: -2.01

\n", + "

Sum score: -2.15

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score_generation(model, \"The $E=m c^3$ paper [START_REF]\")" + ] + }, + { + "cell_type": "markdown", + "id": "8d19da87", + "metadata": {}, + "source": [ + "---\n", + "It can happen, especially with few-shot prompts, that the models continue to generate text after the expected answer. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02877652", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The IUPAC name of cortisol is: 11β,17α,21-trihydroxypregn-4-ene-3,20-dione.\n", + "\n", + "## See also\n", + "\n", + "* Cortisone\n", + "* Corticosterone\n", + "* Hydrocortisone\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(model.generate(\"The IUPAC name of cortisol is:\"))" + ] + }, + { + "cell_type": "markdown", + "id": "2f2d48da", + "metadata": {}, + "source": [ + "For this reason, some of the generations above specify `max_new_tokens` manually to make the examples more readable and easier to follow. In practice, a post-processing step may be needed, depending on the use case." + ] + }, + { + "cell_type": "markdown", + "id": "d8d734c9", + "metadata": {}, + "source": [ + "The generation might assume a different type of document than expected:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d24486e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The IUPAC name of cortisol is:\n", + "\n", + "A. 17-Hydroxyprogesterone\n", + "B. 11-Deoxycortisol\n", + "C. 11-Deoxycorticosterone\n", + "D. 
17-Hydroxycorticosterone\n", + "\n", + "Answer: B\n" + ] + } + ], + "source": [ + "print(model.generate(\"The IUPAC name of cortisol is:\\n\\n\"))" + ] + }, + { + "cell_type": "markdown", + "id": "9d3e070d", + "metadata": {}, + "source": [ + "For more details see [our paper](https://galactica.org/paper.pdf)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1028002", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/Introduction to Galactica Models.pdf b/notebooks/Introduction to Galactica Models.pdf new file mode 100644 index 0000000..077b52f Binary files /dev/null and b/notebooks/Introduction to Galactica Models.pdf differ diff --git a/requirements.txt b/requirements.txt index 657c04b..ec8273f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,8 @@ -torch +torch>=1.12 +transformers==4.25.1 tokenizers -bert-score -openai -tqdm -datasets -prompt_toolkit -promptsource -spacy==3.3.0 -rouge_score -nltk -parallelformers +parallelformers==1.2.7 accelerate -more_itertools -thefuzz \ No newline at end of file +markdown>=3.4 +bleach[css]~=5.0.1 +psutil diff --git a/setup.py b/setup.py index 208ba10..c263d02 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,8 @@ from setuptools import setup, find_packages PACKAGE_NAME = 'galai' -VERSION = "1.0.0" -DESCRIPTION = "API for the GALILEO model" +VERSION = "1.1.7.dev0" +DESCRIPTION = "API for the GALACTICA model" KEYWORDS = "Scientific Intelligence" URL = 'https://github.com/paperswithcode/galai' AUTHOR = 'Papers with Code'