diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000..c955c84
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,47 @@
+FROM python:3.10
+
+RUN pip install --no-cache-dir --upgrade pip
+
+# Install developer tooling in one layer. DEBIAN_FRONTEND is set only for this
+# command so it does not leak into the runtime environment, and the apt lists
+# are removed in the same layer that created them.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        curl \
+        git \
+        sudo \
+        vim \
+        wget \
+        zsh \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG USERNAME=vscode
+ARG USER_UID=1000
+ARG USER_GID=$USER_UID
+
+# Create the non-root developer account and grant it passwordless sudo. The
+# sudoers entry is derived from USERNAME (not hard-coded) so overriding the
+# build argument still yields a user that can sudo.
+RUN groupadd --gid "$USER_GID" "$USERNAME" \
+    && useradd --uid "$USER_UID" --gid "$USER_GID" --groups sudo --create-home "$USERNAME" \
+    && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" > "/etc/sudoers.d/$USERNAME" \
+    && chmod 0440 "/etc/sudoers.d/$USERNAME"
+
+USER $USERNAME
+
+# Install oh-my-zsh without prompts. The installer is downloaded to /tmp and
+# deleted in the same layer instead of being left in the home directory.
+# NOTE: this follows the upstream master branch, so rebuilds are not reproducible.
+RUN wget -qO /tmp/install-ohmyzsh.sh https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh \
+    && sh /tmp/install-ohmyzsh.sh --unattended \
+    && rm -f /tmp/install-ohmyzsh.sh
+
+# Create the Python virtual environment and have every interactive zsh session
+# activate it. pip is upgraded inside the venv as well because a venv gets its
+# own bundled pip rather than the system-wide one upgraded above. The
+# ${VAR:+...} form avoids a leading ':' (an empty path entry) when PYTHONPATH
+# is unset.
+RUN python -m venv ~/venv \
+    && ~/venv/bin/pip install --no-cache-dir --upgrade pip \
+    && echo 'source ~/venv/bin/activate' >> ~/.zshrc \
+    && echo 'export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}/workspace"' >> ~/.zshrc
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..968caa1
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,55 @@
+// For format details, see https://aka.ms/devcontainer.json. 
For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/postgres +{ + "name": "Geektime Course Notebook", + "dockerComposeFile": "docker-compose.yml", + "service": "app", + "workspaceFolder": "/workspace", + "customizations": { + "vscode": { + "settings": { + "terminal.integrated.defaultProfile.linux": "zsh", + "terminal.integrated.profiles.linux": { + "zsh": { + "path": "/bin/zsh" + } + } + }, + "extensions": [ + "GitHub.copilot", + "GitHub.copilot-labs", + "GitHub.vscode-pull-request-github", + "ms-python.python", + "ms-python.vscode-pylance", + "ms-python.pylint", + "ms-python.isort", + "ms-python.black-formatter", + "matangover.mypy", + "ms-toolsai.jupyter", + "ms-toolsai.jupyter-keymap", + "ms-toolsai.vscode-jupyter-slideshow", + "eamodio.gitlens" + ] + } + }, + "containerEnv": { + "OPENAI_API_KEY": "${localEnv:OPENAI_API_KEY}", + "JUPYTER_HOME": "${localEnv:JUPYTER_HOME}" + }, + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // This can be used to network with other containers or the host. + "forwardPorts": [8080, 3306, 6379], + + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "./.devcontainer/postCreateCommand.sh" + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} + diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml new file mode 100644 index 0000000..8dccfe0 --- /dev/null +++ b/.devcontainer/docker-compose.yml @@ -0,0 +1,13 @@ +version: '3.8' + +services: + app: + build: + context: .. 
+ dockerfile: .devcontainer/Dockerfile + volumes: + - ..:/workspace + ports: + - "8888:8888" + user: vscode + command: sleep infinity \ No newline at end of file diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh new file mode 100755 index 0000000..3a48a44 --- /dev/null +++ b/.devcontainer/postCreateCommand.sh @@ -0,0 +1,5 @@ +#!/bin/bash +source /home/vscode/venv/bin/activate +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip install -r requirements.txt +sudo chown vscode:vscode /workspace \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..e1d1fae --- /dev/null +++ b/.env.example @@ -0,0 +1,2 @@ +OPENAI_API_KEY=YOUR_API_KEY +JUPYTER_HOME=/workspace \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1d2eee0..3367168 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,13 @@ .DS_Store +.env .ipynb_checkpoints +cc.en.300.bin data/podcast_clip_*.mp3 data/podcast_long.* data/paddlespeech*.wav data/tts.* -data/transcripts \ No newline at end of file +data/transcripts +data/*.jsonl +data/20_newsgroup.csv +data/output.png +data/toutiao_cat_data* diff --git a/01_open_ai_101.ipynb b/01_open_ai_101.ipynb index d81772a..2a0054a 100644 --- a/01_open_ai_101.ipynb +++ b/01_open_ai_101.ipynb @@ -1,35 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: openai in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (0.27.0)\n", - "Requirement already satisfied: requests>=2.20 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from openai) (2.28.2)\n", - "Requirement already satisfied: aiohttp in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from openai) (3.8.4)\n", - "Requirement already satisfied: tqdm in 
/Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from openai) (4.64.1)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from requests>=2.20->openai) (3.0.1)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from requests>=2.20->openai) (1.26.14)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from requests>=2.20->openai) (3.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from requests>=2.20->openai) (2022.12.7)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from aiohttp->openai) (1.8.2)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from aiohttp->openai) (6.0.4)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from aiohttp->openai) (1.3.3)\n", - "Requirement already satisfied: attrs>=17.3.0 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from aiohttp->openai) (22.2.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from aiohttp->openai) (1.3.1)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Users/xuwenhao/miniconda3/envs/py310/lib/python3.10/site-packages (from aiohttp->openai) (4.0.2)\n" - ] - } - ], - "source": [ - "!conda install openai" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -62,10 +32,12 @@ } ], "source": [ - "import openai\n", + "from openai import OpenAI\n", "import os\n", "\n", - "openai.api_key = os.environ.get(\"OPENAI_API_KEY\")\n", + "client = 
OpenAI(\n", + " api_key=os.environ['OPENAI_API_KEY'],\n", + ")\n", "COMPLETION_MODEL = \"text-davinci-003\"\n", "\n", "\n", @@ -80,8 +52,8 @@ "\"\"\"\n", "\n", "def get_response(prompt):\n", - " completions = openai.Completion.create (\n", - " engine=COMPLETION_MODEL,\n", + " completions = client.completions.create (\n", + " model=COMPLETION_MODEL,\n", " prompt=prompt,\n", " max_tokens=512,\n", " n=1,\n", @@ -144,7 +116,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.10.1" }, "vscode": { "interpreter": { diff --git a/02_zero_shot_classification.ipynb b/02_zero_shot_classification.ipynb index 974f41a..f372939 100644 --- a/02_zero_shot_classification.ipynb +++ b/02_zero_shot_classification.ipynb @@ -18,19 +18,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "好评例子的评分 : 0.070963\n", - "差评例子的评分 : -0.081472\n" + "好评例子的评分 : 0.040873\n", + "差评例子的评分 : -0.016688\n" ] } ], "source": [ - "import openai\n", + "from openai import OpenAI\n", + "import numpy as np\n", "import os\n", - "from openai.embeddings_utils import cosine_similarity, get_embedding\n", "\n", - "openai.api_key = os.environ.get(\"OPENAI_API_KEY\")\n", + "client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])\n", + "\n", "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n", "\n", + "def get_embedding(text, model=EMBEDDING_MODEL):\n", + " text = text.replace(\"\\n\", \" \")\n", + " return client.embeddings.create(input = [text], model=model).data[0].embedding\n", + "\n", + "def cosine_similarity(vector_a, vector_b):\n", + " dot_product = np.dot(vector_a, vector_b)\n", + " norm_a = np.linalg.norm(vector_a)\n", + " norm_b = np.linalg.norm(vector_b)\n", + " epsilon = 1e-10\n", + " cosine_similarity = dot_product / (norm_a * norm_b + epsilon)\n", + " return cosine_similarity\n", + "\n", "positive_review = get_embedding(\"好评\")\n", "negative_review = get_embedding(\"差评\")\n", "\n", @@ -65,8 +78,8 @@ "name": "stdout", "output_type": 
"stream", "text": [ - "好评餐馆的评分 : 0.062719\n", - "差评餐馆的评分 : -0.074591\n" + "好评餐馆的评分 : 0.051292\n", + "差评餐馆的评分 : -0.006322\n" ] } ], @@ -100,7 +113,7 @@ "\n", "from sklearn.metrics import classification_report\n", "\n", - "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", + "datafile_path = os.environ.get(\"JUPYTER_HOME\") + \"/data/fine_food_reviews_with_embeddings_1k.csv\"\n", "\n", "df = pd.read_csv(datafile_path)\n", "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n", @@ -141,7 +154,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -157,7 +170,7 @@ " labels = ['negative', 'positive'], \n", " model = EMBEDDING_MODEL,\n", "):\n", - " label_embeddings = [get_embedding(label, engine=model) for label in labels]\n", + " label_embeddings = [get_embedding(label, model=model) for label in labels]\n", "\n", " def label_score(review_embedding, label_embeddings):\n", " return cosine_similarity(review_embedding, label_embeddings[1]) - cosine_similarity(review_embedding, label_embeddings[0])\n", @@ -191,7 +204,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.10.1" }, "vscode": { "interpreter": { diff --git a/03_food_chatbot.py b/03_food_chatbot.py new file mode 100644 index 0000000..e840ac8 --- /dev/null +++ b/03_food_chatbot.py @@ -0,0 +1,44 @@ +from openai import OpenAI +import os + +client = OpenAI(api_key = os.environ["OPENAI_API_KEY"]) + +def ask_gpt3(prompt): + response = client.completions.create( + model="text-davinci-003", + prompt=prompt, + max_tokens=512, + n=1, + stop=None, + temperature=0.5, + ) + + message = response.choices[0].text.strip() + return message + +print("你好,我是一个聊天机器人,请你提出你的问题吧?") + +questions = [] +answers = [] + + +def generate_prompt(prompt, questions, answers): + num = len(answers) + for i in range(num): + prompt += "\n Q : " + questions[i] + prompt += "\n A : " + answers[i] + prompt += "\n Q : " + questions[num] + "\n A : " + return prompt + +while True: + user_input = input("> ") + questions.append(user_input) + if user_input.lower() in ["bye", "goodbye", "exit"]: + print("Goodbye!") + break + + prompt = generate_prompt("", questions, answers) + + answer = ask_gpt3(prompt) + print(answer) + answers.append(answer) \ No newline at end of file diff --git a/03_prompt_chatbot.ipynb b/03_prompt_chatbot.ipynb index 56c80a6..c931db1 100644 --- a/03_prompt_chatbot.ipynb +++ b/03_prompt_chatbot.ipynb @@ -13,18 +13,19 @@ "metadata": {}, "outputs": [], "source": [ - "import openai\n", + "from openai import 
OpenAI\n", "import os\n", "\n", - "openai.api_key = os.environ.get(\"OPENAI_API_KEY\")\n", + "client = OpenAI(api_key = os.environ.get(\"OPENAI_API_KEY\"))\n", + "\n", "COMPLETION_MODEL = \"text-davinci-003\"\n", "\n", "\n", "prompt = '请你用朋友的语气回复给到客户,并称他为“亲”,他的订单已经发货在路上了,预计在3天之内会送达,订单号2021AEDG,我们很抱歉因为天气的原因物流时间比原来长,感谢他选购我们的商品。'\n", "\n", "def get_response(prompt, temperature = 1.0, stop=None):\n", - " completions = openai.Completion.create (\n", - " engine=COMPLETION_MODEL,\n", + " completions = client.completions.create (\n", + " model=COMPLETION_MODEL,\n", " prompt=prompt,\n", " max_tokens=1024,\n", " n=1,\n", @@ -32,8 +33,7 @@ " temperature=temperature, \n", " )\n", " message = completions.choices[0].text\n", - " return message\n", - " \n" + " return message" ] }, { @@ -47,7 +47,7 @@ "text": [ "\n", "\n", - "亲,您的订单2021AEDG已发货,预计在三天内会送达。很抱歉因为天气的原因,物流可能会迟了一点。谢谢您选购我们的商品。\n" + "亲,您的订单已发货,订单号2021AEDG,大概3天内就能收到包裹。由于天气影响,物流需要的时间比原来预期的要长,请您多多包涵,非常感谢您选购我们的商品。\n" ] } ], @@ -66,7 +66,7 @@ "text": [ "\n", "\n", - "亲,您已下单成功,订单号2021AEDG,我们已将货物发出,预计将在3天内送达。目前,由于天气的原因,物流时间比原来有些延迟,这里我们真的很抱歉。最后,感谢您对我们的支持与关注!祝您生活愉快!\n" + "亲,您的订单(2021AEDG)已经发货在路上了,预计在3天内会到达。很抱歉这次因为天气的原因物流花费的时间比原来长了一点,感谢您对我们的商品选购,祝您购物愉快!\n" ] } ], @@ -161,15 +161,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "1. 锅中放油,加入蒜末、葱末、冬菇粉和大蒜粉,加入腊肉片爆香。\n", + "鱼香肉丝做法:\n", + "\n", + "1.准备好食材:500克猪肉(剁碎)、200克黄瓜(切丝)、2个胡萝卜(切丝)、2茶匙盐、1茶匙料酒、1汤匙蚝油、3汤匙鱼香调料、1汤匙葱花、油适量、水适量。\n", "\n", - "2. 加入肉末,芹菜末,炒香。\n", + "2. 猪肉倒入碗中加入盐、料酒、蚝油、鱼香调料搅拌均匀腌制10分钟。\n", "\n", - "3. 加入番茄酱、水淀粉、盐、糖、老抽、料酒。\n", + "3.黄瓜、胡萝卜条放入一次性食用手套中搅拌均匀,加入少许料酒、盐搅拌均匀腌制10分钟。\n", "\n", - "4. 加入熟白菜和黄豆粉,翻炒。\n", + "4.锅中倒入油,猪肉放入锅中参照切片厚度,开大火炒出油脂,改中小火炒至变色\n", "\n", - "5. 最后加入香菜末炒匀出锅即可。\n" + "5. 
锅中倒入黄瓜和胡萝卜丝,加入少许水翻炒至食材变软,加入葱花翻炒均匀即可。\n" ] } ], @@ -197,10 +199,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.准备好食材:500克牛肉,2勺蚝油,1勺老抽,1勺葱花,配料:2勺姜末,2勺料酒,1勺糖,半勺盐,1勺醋,2勺香油,半勺糯米粉,半勺绍酒。\n", - "2.将牛肉洗净,切成片,放入料酒、盐、姜末拌匀腌制20分钟。\n", - "3.把蚝油、老抽、葱花、糖、醋、香油、糯米粉、绍酒放入碗中,搅拌均匀。\n", - "4.将牛肉片放入船形锅内,慢火翻炒至半熟,倒入调味料,用中火收汁,最后加入香菜,大火翻炒几下即可。\n" + "1.准备好所需食材:400克牛肉,1个洋葱,2勺蚝油,1勺糖,1勺盐,2勺料酒,少许花椒粉,3勺葱末,3勺姜末,3勺蒜末,3勺生抽,3勺老抽,3勺辣椒粉,2勺水淀粉,适量油。\n", + "2.将牛肉洗净后,放入清水中浸泡20分钟后再沥干水分,然后切成片,在肉上施加料酒和盐,抓捏抓匀,腌制20分钟备用。\n", + "3.将洋葱洗净,切成葱花,蒜、葱、姜末拌入水中腌制备用。\n", + "4.将油烧热,放入牛肉片,大火翻炒至肉片表面熟化,加入葱花、姜末和蒜末炒片刻,加入糖、蚝油、生抽、老抽、辣椒粉、料酒,改小火继续翻炒至肉片表面焦黄,倒入水淀粉勾芡,调味即可。\n" ] } ], @@ -220,141 +222,6 @@ "print(get_response(question))" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## food_chatbot.py" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "你好,我是一个聊天机器人,请你提出你的问题吧?\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "> 你好\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "你好\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "> 请问鱼香肉丝怎么做?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "鱼香肉丝的做法:\n", - "1.准备好所需的食材:猪肉、葱、姜、蒜、腐竹、酱油、白糖、料酒、醋、香油等。\n", - "2.将猪肉切成片,加入料酒、盐、鸡精腌制半小时。\n", - "3.准备调料,将葱、姜、蒜、腐竹切成末,将酱油、白糖、醋、香油混合均匀备用。\n", - "4.锅中加入油,放入葱、姜、蒜、腐竹炒香。\n", - "5.放入腌制好的猪肉,翻炒均匀,加入调料拌匀。\n", - "6.加入适量清水,盖上锅盖焖煮5分钟,即可。\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "> 那蚝油牛肉呢?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "蚝油牛肉的做法:\n", - "1.准备好所需的食材:牛肉、葱、姜、蒜、蚝油、料酒、醋、香油等。\n", - "2.将牛肉切成片,加入料酒、盐、鸡精腌制半小时。\n", - "3.准备调料,将葱、姜、蒜切成末,将蚝油、醋、香油混合均匀备用。\n", - "4.锅中加入油,放入葱、姜、蒜炒香。\n", - "5.放入腌制好的牛肉,翻炒均匀,加入调料拌匀。\n", - "6.加入适量清水,盖上锅盖焖煮5分钟,即可。\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "> bye\n" - ] - }, - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "Goodbye!\n" - ] - } - ], - "source": [ - "import openai\n", - "import os\n", - "\n", - "openai.api_key = os.environ[\"OPENAI_API_KEY\"]\n", - "\n", - "def ask_gpt3(prompt):\n", - " response = openai.Completion.create(\n", - " engine=\"text-davinci-003\",\n", - " prompt=prompt,\n", - " max_tokens=512,\n", - " n=1,\n", - " stop=None,\n", - " temperature=0.5,\n", - " )\n", - "\n", - " message = response.choices[0].text.strip()\n", - " return message\n", - "\n", - "print(\"你好,我是一个聊天机器人,请你提出你的问题吧?\")\n", - "\n", - "questions = []\n", - "answers = []\n", - "\n", - "\n", - "def generate_prompt(prompt, questions, answers):\n", - " num = len(answers)\n", - " for i in range(num):\n", - " prompt += \"\\n Q : \" + questions[i]\n", - " prompt += \"\\n A : \" + answers[i]\n", - " prompt += \"\\n Q : \" + questions[num] + \"\\n A : \" \n", - " return prompt\n", - "\n", - "while True:\n", - " user_input = input(\"> \")\n", - " questions.append(user_input)\n", - " if user_input.lower() in [\"bye\", \"goodbye\", \"exit\"]:\n", - " print(\"Goodbye!\")\n", - " break\n", - " \n", - " prompt = generate_prompt(\"\", questions, answers)\n", - "\n", - " answer = ask_gpt3(prompt)\n", - " print(answer)\n", - " answers.append(answer)\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -364,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -397,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "tags": [] }, @@ -436,7 +303,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.10.1" }, "vscode": { "interpreter": { diff --git a/04_classification_comparison.ipynb b/04_classification_comparison.ipynb index 50a45ea..dcafe69 100644 --- a/04_classification_comparison.ipynb +++ b/04_classification_comparison.ipynb @@ -1,413 +1,511 @@ { - "cells": [ - { - 
"cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting package metadata (current_repodata.json): done\n", - "Solving environment: done\n", - "\n", - "# All requested packages already installed.\n", - "\n", - "Collecting package metadata (current_repodata.json): done\n", - "Solving environment: done\n", - "\n", - "# All requested packages already installed.\n", - "\n" - ] - } - ], - "source": [ - "!conda install gensim\n", - "!conda install fasttext" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fasttext效果测试" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VHClECUkHcPp", + "outputId": "5b33ed1c-c8b3-42a8-a450-9111823f7708" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting fasttext==0.9.2\n", + " Downloading fasttext-0.9.2.tar.gz (68 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/68.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━\u001b[0m \u001b[32m61.4/68.8 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m68.8/68.8 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting pybind11>=2.2 (from fasttext==0.9.2)\n", + " Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)\n", + "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (67.7.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (1.23.5)\n", + "Building wheels for collected packages: fasttext\n", + " Building wheel for fasttext (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199774 sha256=a2abeb1f730b352409085af8f4b7dc464bdeddaa985c73b40a9bdbad5d9a8a67\n", + " Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394\n", + "Successfully built fasttext\n", + "Installing collected packages: pybind11, fasttext\n", + "Successfully installed fasttext-0.9.2 pybind11-2.11.1\n", + "Requirement already satisfied: gensim==4.3.2 in /usr/local/lib/python3.10/dist-packages (4.3.2)\n", + "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.2) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.2) (1.11.4)\n", + "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.2) (6.4.0)\n" + ] + } + ], + "source": [ + "%pip install fasttext==0.9.2\n", + "%pip install gensim==4.3.2" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-03-20 10:56:41-- https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\n", - "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142\n", - "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 4503593528 (4.2G) [application/octet-stream]\n", - "Saving to: ‘cc.en.300.bin.gz’\n", - "\n", - "cc.en.300.bin.gz 22%[===> ] 978.62M 2.72MB/s in 6m 50s \n", - "\n", - "2023-03-20 11:03:33 (2.39 MB/s) - Read error at byte 1026161458/4503593528 (error:1408F119:SSL routines:ssl3_get_record:decryption failed or bad record mac). Retrying.\n", - "\n", - "--2023-03-20 11:03:34-- (try: 2) https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\n", - "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.\n", - "HTTP request sent, awaiting response... 206 Partial Content\n", - "Length: 4503593528 (4.2G), 3477432070 (3.2G) remaining [application/octet-stream]\n", - "Saving to: ‘cc.en.300.bin.gz’\n", - "\n", - "cc.en.300.bin.gz 56%[++++======> ] 2.37G 5.96MB/s in 8m 17s \n", - "\n", - "2023-03-20 11:11:52 (2.92 MB/s) - Read error at byte 2547284735/4503593528 (error:1408F119:SSL routines:ssl3_get_record:decryption failed or bad record mac). Retrying.\n", - "\n", - "--2023-03-20 11:11:54-- (try: 3) https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\n", - "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.\n", - "HTTP request sent, awaiting response... 206 Partial Content\n", - "Length: 4503593528 (4.2G), 1956308793 (1.8G) remaining [application/octet-stream]\n", - "Saving to: ‘cc.en.300.bin.gz’\n", - "\n", - "cc.en.300.bin.gz 80%[+++++++++++====> ] 3.37G 7.67MB/s in 2m 41s \n", - "\n", - "2023-03-20 11:14:35 (6.32 MB/s) - Read error at byte 3614428013/4503593528 (error:1408F119:SSL routines:ssl3_get_record:decryption failed or bad record mac). Retrying.\n", - "\n", - "--2023-03-20 11:14:38-- (try: 4) https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\n", - "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.\n", - "HTTP request sent, awaiting response... 
206 Partial Content\n", - "Length: 4503593528 (4.2G), 889165515 (848M) remaining [application/octet-stream]\n", - "Saving to: ‘cc.en.300.bin.gz’\n", - "\n", - "cc.en.300.bin.gz 100%[++++++++++++++++===>] 4.19G 6.65MB/s in 2m 7s \n", - "\n", - "2023-03-20 11:16:46 (6.68 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]\n", - "\n", - "--2023-03-20 11:16:46-- http://./\n", - "Resolving . (.)... 123.151.76.67, 61.151.174.112, 14.18.178.36, ...\n", - "Connecting to . (.)|123.151.76.67|:80... failed: Operation timed out.\n", - "Connecting to . (.)|61.151.174.112|:80... failed: Operation timed out.\n", - "Connecting to . (.)|14.18.178.36|:80... ^C\n" - ] - } - ], - "source": [ - "!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "!gunzip ./cc.en.300.bin.gz " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import gensim\n", - "import numpy as np\n", - "# Load the FastText pre-trained model\n", - "model = gensim.models.fasttext.load_facebook_model('cc.en.300.bin')\n", - "\n", - "def get_fasttext_vector(line):\n", - " vec = np.zeros(300) # Initialize an empty 300-dimensional vector\n", - " for word in line.split():\n", - " vec += model.wv[word]\n", - " vec /= len(line.split()) # Take the average over all words in the line\n", - " return vec" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "positive_text = \"\"\"Wanted to save some to bring to my Chicago family but my North Carolina family ate all 4 boxes before I could pack. These are excellent...could serve to anyone\"\"\"\n", - "negative_text = \"\"\"First, these should be called Mac - Coconut bars, as Coconut is the #2 ingredient and Mango is #3. Second, lots of people don't like coconut. I happen to be allergic to it. 
Word to Amazon that if you want happy customers to make things like this more prominent. Thanks.\"\"\"\n", - "\n", - "positive_example_in_fasttext = get_fasttext_vector(positive_text)\n", - "negative_example_in_fasttext = get_fasttext_vector(negative_text)\n", - "\n", - "positive_review_in_fasttext = get_fasttext_vector(\"An Amazon review with a positive sentiment.\")\n", - "negative_review_in_fasttext = get_fasttext_vector('An Amazon review with a negative sentiment.')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "wXQ8qh_3HcPq" + }, + "source": [ + "### Fasttext效果测试" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fasttext好评例子的评分 : -0.000544\n", - "Fasttext差评例子的评分 : 0.000369\n" - ] - } - ], - "source": [ - "from openai.embeddings_utils import cosine_similarity\n", - "\n", - "def get_fasttext_score(sample_embedding):\n", - " return cosine_similarity(sample_embedding, positive_review_in_fasttext) - cosine_similarity(sample_embedding, negative_review_in_fasttext)\n", - "\n", - "positive_score = get_fasttext_score(positive_example_in_fasttext)\n", - "negative_score = get_fasttext_score(negative_example_in_fasttext)\n", - "\n", - "print(\"Fasttext好评例子的评分 : %f\" % (positive_score))\n", - "print(\"Fasttext差评例子的评分 : %f\" % (negative_score))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### T5效果测试" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# install transformers if required\n", - "#!conda install -y transformers -c conda-forge\n", - "#!conda install -y pytorch torchvision -c pytorch\n", - "#!conda install -y sentencepiece" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import AutoTokenizer, AutoModel\n", - "import torch\n", - "\n", - "# load the T5 tokenizer and 
model\n", - "tokenizer = AutoTokenizer.from_pretrained('t5-small', model_max_length=512)\n", - "model = AutoModel.from_pretrained('t5-small')\n", - "\n", - "# set the model to evaluation mode\n", - "model.eval()\n", - "\n", - "# encode the input sentence\n", - "def get_t5_vector(line):\n", - " input_ids = tokenizer.encode(line, return_tensors='pt', max_length=512, truncation=True)\n", - " # generate the vector representation\n", - " with torch.no_grad():\n", - " outputs = model.encoder(input_ids=input_ids)\n", - " vector = outputs.last_hidden_state.mean(dim=1)\n", - " return vector[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9LRcwrCvHcPr", + "outputId": "df76731d-ff58-43c9-86f0-34b1834045f8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-12-31 04:59:44-- https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\n", + "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.7.128, 13.35.7.50, 13.35.7.38, ...\n", + "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.7.128|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 4503593528 (4.2G) [application/octet-stream]\n", + "Saving to: ‘cc.en.300.bin.gz’\n", + "\n", + "cc.en.300.bin.gz 100%[===================>] 4.19G 155MB/s in 28s \n", + "\n", + "2023-12-31 05:00:12 (151 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "T5好评例子的评分 : -0.010294\n", - "T5差评例子的评分 : -0.008990\n" - ] - } - ], - "source": [ - "positive_review_in_t5 = get_t5_vector(\"An Amazon review with a positive sentiment.\")\n", - "negative_review_in_t5 = get_t5_vector('An Amazon review with a negative sentiment.')\n", - "\n", - "def test_t5():\n", - " positive_example_in_t5 = get_t5_vector(positive_text)\n", - " negative_example_in_t5 = get_t5_vector(negative_text)\n", - "\n", - " def get_t5_score(sample_embedding):\n", - " return cosine_similarity(sample_embedding, positive_review_in_t5) - cosine_similarity(sample_embedding, negative_review_in_t5)\n", - "\n", - " positive_score = get_t5_score(positive_example_in_t5)\n", - " negative_score = get_t5_score(negative_example_in_t5)\n", - "\n", - " print(\"T5好评例子的评分 : %f\" % (positive_score))\n", - " print(\"T5差评例子的评分 : %f\" % (negative_score))\n", - "\n", - "test_t5()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "6SiTtR8QHcPr" + }, + "outputs": [], + "source": [ + "!gunzip ./cc.en.300.bin.gz" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "5ykoNmB7HcPr" + }, + "outputs": [], + "source": [ + "import gensim\n", + "import numpy as np\n", + "# Load the FastText pre-trained model\n", + "model = gensim.models.fasttext.load_facebook_model('cc.en.300.bin')\n", + "\n", + "def get_fasttext_vector(line):\n", + " vec = np.zeros(300) # Initialize an empty 300-dimensional vector\n", + " for word in line.split():\n", + " vec += model.wv[word]\n", + " vec /= len(line.split()) # Take the average over all words in the line\n", + " return vec" + ] }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4cc8a70db87547678546dd9b433cf7fd", - "version_major": 2, - "version_minor": 0 + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ANbdHEF9HcPr", + "outputId": "56a3313c-7558-4c66-ee91-b0e706dbe04b" }, - "text/plain": [ - "Downloading (…)/main/tokenizer.json: 0%| | 0.00/1.39M [00:00=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.4)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", + "Requirement already satisfied: 
tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.0)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.1)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.11.17)\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.1.0+cu121)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (0.16.0+cu121)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.13.1)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.12)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.2.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", + 
"Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2023.6.0)\n", + "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.1.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from torchvision) (1.23.5)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from torchvision) (2.31.0)\n", + "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision) (9.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2023.11.17)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", + "Collecting sentencepiece\n", + " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: sentencepiece\n", + "Successfully installed sentencepiece-0.1.99\n" + ] + } + ], + "source": [ + "# install transformers if required\n", + "%pip install transformers\n", + "%pip install torch torchvision\n", + "%pip install sentencepiece==0.1.99" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "_G0uqiyBHcPs" + }, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModel\n", + "import torch\n", + "\n", + "# load the T5 tokenizer and model\n", + "tokenizer = AutoTokenizer.from_pretrained('t5-small', model_max_length=512)\n", + "model = AutoModel.from_pretrained('t5-small')\n", + "\n", + "# set the model to evaluation mode\n", + "model.eval()\n", + "\n", + "# encode the input sentence\n", + "def get_t5_vector(line):\n", + " input_ids = tokenizer.encode(line, return_tensors='pt', max_length=512, truncation=True)\n", + " # generate the vector representation\n", + " with torch.no_grad():\n", + " outputs = model.encoder(input_ids=input_ids)\n", + " vector = outputs.last_hidden_state.mean(dim=1)\n", + " return vector[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "T59zGw5OHcPs", + "outputId": "27d80c12-c0fe-4c78-f9f8-c5c2e4f91869" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T5好评例子的评分 : -0.010294\n", + "T5差评例子的评分 : -0.008990\n" + ] + } + ], + "source": [ + "positive_review_in_t5 = get_t5_vector(\"An Amazon review with a positive sentiment.\")\n", + "negative_review_in_t5 = get_t5_vector('An Amazon review with a negative sentiment.')\n", + "\n", + "def test_t5():\n", + " positive_example_in_t5 = get_t5_vector(positive_text)\n", + " negative_example_in_t5 = get_t5_vector(negative_text)\n", + "\n", + " def get_t5_score(sample_embedding):\n", + " return cosine_similarity(sample_embedding, positive_review_in_t5) - cosine_similarity(sample_embedding, negative_review_in_t5)\n", + "\n", + " positive_score = get_t5_score(positive_example_in_t5)\n", + " negative_score = get_t5_score(negative_example_in_t5)\n", + "\n", + " print(\"T5好评例子的评分 : %f\" % (positive_score))\n", + " print(\"T5差评例子的评分 : %f\" % (negative_score))\n", + "\n", + 
"test_t5()" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " negative 0.60 0.90 0.72 136\n", - " positive 0.98 0.90 0.94 789\n", - "\n", - " accuracy 0.90 925\n", - " macro avg 0.79 0.90 0.83 925\n", - "weighted avg 0.93 0.90 0.91 925\n", - "\n" - ] + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gxOy84BAHcPs", + "outputId": "6d2aafab-7da5-41b7-9969-38f87d97504a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T5好评例子的评分 : 0.010347\n", + "T5差评例子的评分 : -0.023935\n" + ] + } + ], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained('t5-base', model_max_length=512)\n", + "model = AutoModel.from_pretrained('t5-base')\n", + "\n", + "# set the model to evaluation mode\n", + "model.eval()\n", + "\n", + "# encode the input sentence\n", + "def get_t5_vector(line):\n", + " input_ids = tokenizer.encode(line, return_tensors='pt', max_length=512, truncation=True)\n", + " # generate the vector representation\n", + " with torch.no_grad():\n", + " outputs = model.encoder(input_ids=input_ids)\n", + " vector = outputs.last_hidden_state.mean(dim=1)\n", + " return vector[0]\n", + "\n", + "positive_review_in_t5 = get_t5_vector(\"An Amazon review with a positive sentiment.\")\n", + "negative_review_in_t5 = get_t5_vector('An Amazon review with a negative sentiment.')\n", + "\n", + "test_t5()" + ] }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ITMekdfcHcPs", + "outputId": "321fb40f-672d-4425-9f2b-f0ce4016bb20" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":12: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"sentiment\"] = df.Score.replace({1: \"negative\", 2: \"negative\", 4: \"positive\", 5: \"positive\"})\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.metrics import classification_report\n", + "\n", + "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", + "\n", + "df = pd.read_csv(datafile_path)\n", + "\n", + "\n", + "df[\"t5_embedding\"] = df.Text.apply(get_t5_vector)\n", + "# convert 5-star rating to binary sentiment\n", + "df = df[df.Score != 3]\n", + "df[\"sentiment\"] = df.Score.replace({1: \"negative\", 2: \"negative\", 4: \"positive\", 5: \"positive\"})" ] - }, - "metadata": {}, - "output_type": "display_data" + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 628 + }, + "id": "YDubfmifHcPs", + "outputId": "fadf2451-e52d-4da0-ac3b-e508927d050e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " negative 0.60 0.90 0.72 136\n", + " positive 0.98 0.90 0.94 789\n", + "\n", + " accuracy 0.90 925\n", + " macro avg 0.79 0.90 0.83 925\n", + "weighted avg 0.93 0.90 0.91 925\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import PrecisionRecallDisplay\n", + "\n", + "def evaluate_embeddings_approach():\n", + " def label_score(review_embedding):\n", + " return cosine_similarity(review_embedding, positive_review_in_t5) - cosine_similarity(review_embedding, negative_review_in_t5)\n", + "\n", + " probas = df[\"t5_embedding\"].apply(lambda x: label_score(x))\n", + " preds = probas.apply(lambda x: 'positive' if x>0 else 'negative')\n", + "\n", + " report = classification_report(df.sentiment, preds)\n", + " print(report)\n", + "\n", + " display = PrecisionRecallDisplay.from_predictions(df.sentiment, probas, pos_label='positive')\n", + " _ = display.ax_.set_title(\"2-class Precision-Recall curve\")\n", + "\n", + "evaluate_embeddings_approach()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rimAixIhTAWt" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "V100", + "machine_shape": "hm", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "vscode": { + "interpreter": { + "hash": "8114e84f04cf14e493992e1b725447accf84073d5ec18e7063d492738bf032cb" + } } - ], - "source": [ - "from sklearn.metrics import PrecisionRecallDisplay\n", - "\n", - "def evaluate_embeddings_approach():\n", - " def label_score(review_embedding):\n", - " return cosine_similarity(review_embedding, positive_review_in_t5) - cosine_similarity(review_embedding, negative_review_in_t5)\n", - "\n", - " probas = df[\"t5_embedding\"].apply(lambda x: label_score(x))\n", - " preds = probas.apply(lambda x: 'positive' if x>0 else 
'negative')\n", - "\n", - " report = classification_report(df.sentiment, preds)\n", - " print(report)\n", - "\n", - " display = PrecisionRecallDisplay.from_predictions(df.sentiment, probas, pos_label='positive')\n", - " _ = display.ax_.set_title(\"2-class Precision-Recall curve\")\n", - "\n", - "evaluate_embeddings_approach()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" }, - "vscode": { - "interpreter": { - "hash": "8114e84f04cf14e493992e1b725447accf84073d5ec18e7063d492738bf032cb" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/05_classification_in_ml.ipynb b/05_classification_in_ml.ipynb index de4a1e5..3b22063 100644 --- a/05_classification_in_ml.ipynb +++ b/05_classification_in_ml.ipynb @@ -9,50 +9,18 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-03-20 11:13:19-- https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset/raw/master/toutiao_cat_data.txt.zip\n", - "Resolving github.com (github.com)... 192.30.255.112\n", - "Connecting to github.com (github.com)|192.30.255.112|:443... connected.\n", - "HTTP request sent, awaiting response... 
302 Found\n", - "Location: https://raw.githubusercontent.com/aceimnorstuvwxz/toutiao-text-classfication-dataset/master/toutiao_cat_data.txt.zip [following]\n", - "--2023-03-20 11:13:20-- https://raw.githubusercontent.com/aceimnorstuvwxz/toutiao-text-classfication-dataset/master/toutiao_cat_data.txt.zip\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 26912069 (26M) [application/zip]\n", - "Saving to: ‘toutiao_cat_data.txt.zip’\n", - "\n", - "toutiao_cat_data.tx 100%[===================>] 25.67M 4.45MB/s in 10s \n", - "\n", - "2023-03-20 11:13:31 (2.51 MB/s) - ‘toutiao_cat_data.txt.zip’ saved [26912069/26912069]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "!wget https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset/raw/master/toutiao_cat_data.txt.zip" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Archive: ./toutiao_cat_data.txt.zip\n", - " inflating: toutiao_cat_data.txt \n" - ] - } - ], + "outputs": [], "source": [ "!unzip ./toutiao_cat_data.txt.zip\n", "!mv ./toutiao_cat_data.txt data/" @@ -60,15 +28,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/qp/42zc2mfd2w37v06s36xltvz80000gn/T/ipykernel_72415/1171345427.py:16: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", - " df = pd.read_csv('data/toutiao_cat_data.txt', sep='_!_', names=['id', 
'code', 'category', 'title', 'keywords'])\n" + "/var/folders/lk/1jcy9d_j6258__3gv0rpj8h40000gn/T/ipykernel_3501/2567486315.py:16: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " df = pd.read_csv(os.environ.get(\"JUPYTER_HOME\") + '/data/toutiao_cat_data.txt', sep='_!_', names=['id', 'code', 'category', 'title', 'keywords'])\n" ] }, { @@ -96,7 +64,7 @@ "\n", "\n", "# import data/toutiao_cat_data.txt as a pandas dataframe\n", - "df = pd.read_csv('data/toutiao_cat_data.txt', sep='_!_', names=['id', 'code', 'category', 'title', 'keywords'])\n", + "df = pd.read_csv(os.environ.get(\"JUPYTER_HOME\") + '/data/toutiao_cat_data.txt', sep='_!_', names=['id', 'code', 'category', 'title', 'keywords'])\n", "df = df.fillna(\"\")\n", "df[\"combined\"] = (\n", " \"标题: \" + df.title.str.strip() + \"; 关键字: \" + df.keywords.str.strip()\n", @@ -114,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -124,68 +92,103 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "import os\n", + "\n", + "client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])\n", + "\n", + "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n", + "\n", + "def get_embedding(text, model=EMBEDDING_MODEL):\n", + " text = text.replace(\"\\n\", \" \")\n", + " return client.embeddings.create(input = [text], model=model).data[0].embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 请不要执行如下代码\n", - "# from openai.embeddings_utils import get_embedding\n", "\n", "# df_1k = df.sample(1000, random_state=42)\n", "\n", - "# df_1k[\"embedding\"] = df_1k.combined.apply(lambda x : 
get_embedding(x, engine=embedding_model))\n", + "# df_1k[\"embedding\"] = df_1k.combined.apply(lambda x : get_embedding(x, model=embedding_model))\n", "# df_1k.to_csv(\"data/toutiao_cat_data_10k_with_embeddings.csv\", index=False)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 请不需要执行如下代码\n", - "# @backoff.on_exception(backoff.expo, openai.error.RateLimitError)\n", + "# @backoff.on_exception(backoff.expo, openai.RateLimitError)\n", "# def get_embedding_with_backoff(**kwargs):\n", - "# return get_embedding(**kwargs)\n", - "\n", + "# return get_embedding(**kwargs)\n", + "#\n", "# df_10k = df.sample(10000, random_state=42)\n", - "\n", - "# df_10k[\"embedding\"] = df_10k.combined.apply(lambda x : get_embedding_with_backoff(text=x, engine=embedding_model))\n", + "#\n", + "# df_10k[\"embedding\"] = df_10k.combined.apply(lambda x : get_embedding_with_backoff(text=x, model=embedding_model))\n", "# df_10k.to_csv(\"data/toutiao_cat_data_10k_with_embeddings.csv\", index=False)" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Batch 0 Number of embeddings: 2000\n", + "Batch 0 Number of embeddings: 2000\n", + "Batch 0 Number of embeddings: 2000\n", + "Batch 0 Number of embeddings: 2000\n", + "Batch 0 Number of embeddings: 2000\n" + ] + } + ], "source": [ "# 如果你不想重新计算一遍embedding,请不要运行如下代码\n", - "# from openai.embeddings_utils import get_embeddings\n", "\n", - "# batch_size = 2000\n", + "batch_size = 2000\n", + "\n", + "def get_embeddings(list_of_text, model):\n", + " response = client.embeddings.create(input=list_of_text, model=model)\n", + " return [item.embedding for item in response.data]\n", "\n", - "# @backoff.on_exception(backoff.expo, openai.error.RateLimitError)\n", - "# def get_embeddings_with_backoff(prompts, engine):\n", - "# embeddings = []\n", - "# 
for i in range(0, len(prompts), batch_size):\n", - "# batch = prompts[i:i+batch_size]\n", - "# embeddings += get_embeddings(list_of_text=batch, engine=engine)\n", - "# return embeddings\n", + "@backoff.on_exception(backoff.expo, openai.RateLimitError)\n", + "def get_embeddings_with_backoff(prompts, model):\n", + " embeddings = []\n", + " for i in range(0, len(prompts), batch_size):\n", + " batch = prompts[i:i+batch_size]\n", + " batch_embeddings = get_embeddings(list_of_text=batch, model=model)\n", + " embeddings += batch_embeddings\n", + " print(f\"Batch {i} Number of embeddings: {len(embeddings)}\")\n", + " return embeddings\n", "\n", - "# # randomly sample 10k rows\n", - "# df_all = df\n", - "# # group prompts into batches of 100\n", - "# prompts = df_all.combined.tolist()\n", - "# prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]\n", + "# randomly sample 10k rows\n", + "# df_all = df.sample(10000, random_state=42)\n", + "df_all = df\n", + "# group prompts into batches of 100\n", + "prompts = df_all.combined.tolist()\n", + "prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]\n", "\n", - "# embeddings = []\n", - "# for batch in prompt_batches:\n", - "# batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)\n", - "# embeddings += batch_embeddings\n", + "embeddings = []\n", + "for batch in prompt_batches:\n", + " batch_embeddings = get_embeddings_with_backoff(prompts=batch, model=embedding_model)\n", + " embeddings += batch_embeddings\n", "\n", - "# df_all[\"embedding\"] = embeddings\n", - "# df_all.to_parquet(\"data/toutiao_cat_data_all_with_embeddings.parquet\", index=False)" + "df_all[\"embedding\"] = embeddings\n", + "df_all.to_parquet(os.environ.get(\"JUPYTER_HOME\") + \"/data/toutiao_cat_data_all_with_embeddings.parquet\", index=False)" ] }, { @@ -197,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, 
"outputs": [ { @@ -206,25 +209,25 @@ "text": [ " precision recall f1-score support\n", "\n", - " news_agriculture 0.83 0.85 0.84 495\n", - " news_car 0.88 0.94 0.91 895\n", - " news_culture 0.86 0.77 0.81 741\n", - " news_edu 0.86 0.89 0.87 708\n", + " news_agriculture 0.82 0.86 0.84 495\n", + " news_car 0.89 0.94 0.91 895\n", + " news_culture 0.87 0.76 0.81 741\n", + " news_edu 0.86 0.89 0.88 708\n", "news_entertainment 0.71 0.92 0.80 1051\n", - " news_finance 0.80 0.76 0.78 735\n", - " news_game 0.90 0.81 0.86 742\n", - " news_house 0.91 0.87 0.89 450\n", - " news_military 0.88 0.82 0.85 688\n", + " news_finance 0.81 0.76 0.78 735\n", + " news_game 0.91 0.83 0.87 742\n", + " news_house 0.91 0.86 0.89 450\n", + " news_military 0.89 0.83 0.86 688\n", " news_sports 0.90 0.92 0.91 968\n", - " news_story 0.94 0.47 0.62 197\n", - " news_tech 0.81 0.85 0.83 1052\n", + " news_story 0.94 0.46 0.61 197\n", + " news_tech 0.81 0.86 0.83 1052\n", " news_travel 0.80 0.75 0.77 599\n", " news_world 0.82 0.72 0.77 671\n", " stock 0.00 0.00 0.00 8\n", "\n", " accuracy 0.84 10000\n", - " macro avg 0.79 0.76 0.77 10000\n", - " weighted avg 0.84 0.84 0.83 10000\n", + " macro avg 0.80 0.76 0.77 10000\n", + " weighted avg 0.84 0.84 0.84 10000\n", "\n" ] }, @@ -232,11 +235,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/xuwenhao/miniconda3/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + "/opt/homebrew/Caskroom/miniconda/base/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", - "/Users/xuwenhao/miniconda3/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + "/opt/homebrew/Caskroom/miniconda/base/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", - "/Users/xuwenhao/miniconda3/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + "/opt/homebrew/Caskroom/miniconda/base/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } @@ -246,7 +249,7 @@ "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report, accuracy_score\n", "\n", - "training_data = pd.read_parquet(\"data/toutiao_cat_data_all_with_embeddings.parquet\")\n", + "training_data = pd.read_parquet(os.environ.get(\"JUPYTER_HOME\") + \"/data/toutiao_cat_data_all_with_embeddings.parquet\")\n", "\n", "df = training_data.sample(50000, random_state=42)\n", "\n", @@ -265,14 +268,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/Users/xuwenhao/miniconda3/envs/geektime/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "/opt/homebrew/Caskroom/miniconda/base/envs/geektime/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", @@ -280,9 +283,7 @@ "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n", - "/Users/xuwenhao/miniconda3/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "/Users/xuwenhao/miniconda3/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + "/opt/homebrew/Caskroom/miniconda/base/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] }, @@ -294,7 +295,7 @@ "\n", " news_agriculture 0.85 0.88 0.87 3908\n", " news_car 0.92 0.92 0.92 7101\n", - " news_culture 0.82 0.84 0.83 5719\n", + " news_culture 0.82 0.85 0.83 5719\n", " news_edu 0.88 0.89 0.89 5376\n", "news_entertainment 0.85 0.88 0.86 7908\n", " news_finance 0.82 0.78 0.80 5409\n", @@ -303,7 +304,7 @@ " news_military 0.86 0.82 0.84 4976\n", " news_sports 0.93 0.93 0.93 7611\n", " news_story 0.83 0.81 0.82 1308\n", - " news_tech 0.84 0.85 0.85 8168\n", + " news_tech 0.84 0.86 0.85 8168\n", " news_travel 0.80 0.79 0.79 4252\n", " news_world 0.79 0.80 0.80 5370\n", " stock 0.00 0.00 0.00 70\n", @@ -318,7 +319,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/xuwenhao/miniconda3/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", + "/opt/homebrew/Caskroom/miniconda/base/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/opt/homebrew/Caskroom/miniconda/base/envs/geektime/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } @@ -365,7 +368,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.10.13" }, "vscode": { "interpreter": { diff --git a/README.md b/README.md index ce33949..1c2d846 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,29 @@ # Geektime AI Course -See https://time.geekbang.org/column/intro/100541001 +[简体中文](README_zh_CN.md) -Jupyter Notebooks for Geektime AI Course +## Precautions -For large data files, you could download through the following information: -链接: https://pan.baidu.com/s/1Cl0eFNLOkQqquf9ls0trEw 提取码: jvr4 +* I am revising the course and notebooks with the current OpenAI API +* The old version of the notebooks can be found under tag v0.0.1 + +## Using GPU + +* [Google Colab](https://colab.google) is recommended; it can satisfy your GPU usage requirements for the course +* Or you could buy GPU resources through [AutoDL](https://autodl.com/home) + +## Course + +[Purchase Link](https://time.geekbang.org/column/intro/100541001) + +## Large data files + +[Baidu Disks](https://pan.baidu.com/s/1Cl0eFNLOkQqquf9ls0trEw) 提取码: jvr4 + + +## Environment setup + +For old versions ``` conda create -n geektime python=3.10 @@ -13,8 +31,8 @@ conda 
activate geektime conda env update --file conda-env.yml ``` - Or you could use pip as well ``` pip install -r requirements.txt ``` + diff --git a/README_zh_CN.md b/README_zh_CN.md new file mode 100644 index 0000000..f08e860 --- /dev/null +++ b/README_zh_CN.md @@ -0,0 +1,41 @@ +# 极客时间「AI大模型之美」 + +[English](README.md) + +## 注意事项 + +* 正在重新根据OpenAI的新版本API修订专栏和对应的Notebook +* 旧版本的Notebook可以在 v0.0.1 这个 tag 下获取到 + +## 使用 GPU + +* 推荐使用/购买 [Google Colab](https://colab.google),可以满足主要的GPU使用需求 +* 或者购买 [AutoDL](https://autodl.com/home) + +## 专栏 + +[「AI大模型之美」购买链接](https://time.geekbang.org/column/intro/100541001) + +## 数据文件下载网盘地址 + +部分代码依赖一些大的数据文件或者模型文件,可以通过如下的百度网盘地址下载 + +[百度网盘](https://pan.baidu.com/s/1Cl0eFNLOkQqquf9ls0trEw) 提取码: jvr4 + + +## 环境搭建 + +旧版本的环境搭建方式如下 + +使用Conda +``` +conda create -n geektime python=3.10 +conda activate geektime +conda env update --file conda-env.yml +``` + +使用Pip +``` +pip install -r requirements.txt +``` + diff --git a/requirements-gpu.txt b/requirements-gpu.txt new file mode 100644 index 0000000..0d005c9 --- /dev/null +++ b/requirements-gpu.txt @@ -0,0 +1,4 @@ +transformers +torch +torchvision +sentencepiece \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5b2b8b6..c90d85e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,190 +1,11 @@ -aiofiles @ file:///C:/ci/aiofiles_1621287809877/work -aiohttp @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_690hlorvpy/croot/aiohttp_1670009554132/work -aiosignal @ file:///tmp/build/80754af9/aiosignal_1637843061372/work -altair @ file:///home/conda/feedstock_root/build_artifacts/altair_1675180856922/work -anyio @ file:///opt/concourse/worker/volumes/live/eb44598f-565b-45e7-4c50-c1ae70306e18/volume/anyio_1644481722202/work/dist -appdirs==1.4.4 -appnope @ file:///Users/builder/ci_310/appnope_1642500616005/work -argon2-cffi @ file:///opt/conda/conda-bld/argon2-cffi_1645000214183/work -argon2-cffi-bindings @ 
file:///opt/concourse/worker/volumes/live/cf502f86-3f51-4f85-686b-4867f6d672bd/volume/argon2-cffi-bindings_1644569704808/work -asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work -async-timeout @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_732x52axrd/croots/recipe/async-timeout_1664876366763/work -attrs @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_33k1uces4n/croot/attrs_1668696162258/work -Babel @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_59c7q3smap/croot/babel_1671781946809/work -backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work -backoff @ file:///home/conda/feedstock_root/build_artifacts/backoff_1665004764738/work -beautifulsoup4 @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_croot-cdiouih5/beautifulsoup4_1650462164803/work -bleach @ file:///opt/conda/conda-bld/bleach_1641577558959/work -Bottleneck @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_29949159-f86f-474b-bc1f-aaa1e0e222b4ofusifik/croots/recipe/bottleneck_1657175564045/work -brotlipy==0.7.0 -certifi @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_477u68wvzm/croot/certifi_1671487773341/work/certifi -cffi @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_1b0qzba5nr/croot/cffi_1670423213150/work -charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work -click @ file:///opt/concourse/worker/volumes/live/2d66025a-4d79-47c4-43be-6220928b6c82/volume/click_1646056610594/work -colorama @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_f5t80kwp9l/croot/colorama_1672386533201/work -comm @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_0b9r9i3b7k/croot/comm_1671231125581/work -contourpy @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_17gskqgptz/croots/recipe/contourpy_1663827415320/work -cpm-kernels==1.0.11 -cryptography @ 
file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_19cvzxmeb9/croot/cryptography_1677533085498/work -cycler @ file:///tmp/build/80754af9/cycler_1637851556182/work -daal4py==2023.0.2 -dataclasses-json==0.5.7 -debugpy @ file:///Users/builder/ci_310/debugpy_1642501698574/work -decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work -defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work -docker-pycreds @ file:///Users/ktietz/demo/mc3/conda-bld/docker-pycreds_1630654474270/work -entrypoints @ file:///opt/concourse/worker/volumes/live/5eb4850e-dcbc-41ad-5f22-922bac778f70/volume/entrypoints_1649926457041/work -et-xmlfile==1.1.0 -executing @ file:///opt/conda/conda-bld/executing_1646925071911/work -faiss==1.7.2 -fastapi @ file:///home/conda/feedstock_root/build_artifacts/fastapi_1679196090342/work -fastjsonschema @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b5c1gee32t/croots/recipe/python-fastjsonschema_1661368622875/work -ffmpy @ file:///home/conda/feedstock_root/build_artifacts/ffmpy_1659474992694/work -filelock @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_f29yrmlw9_/croot/filelock_1672387130651/work -flit_core @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_9elg0qmmha/croot/flit-core_1679397106218/work/source/flit_core -fonttools==4.25.0 -frozenlist @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_5eiq5594pj/croot/frozenlist_1670004516635/work -fsspec @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_d9zxascxg8/croot/fsspec_1679418997424/work -future @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_27i98bxita/croot/future_1677599886956/work -gensim @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_97o5eolfwi/croot/gensim_1674852444908/work -gitdb @ file:///tmp/build/80754af9/gitdb_1617117951232/work -GitPython @ 
file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_844theeehu/croot/gitpython_1674662784289/work -gradio @ file:///home/conda/feedstock_root/build_artifacts/gradio_1679252812835/work -greenlet==2.0.2 -h11 @ file:///tmp/build/80754af9/h11_1620423447028/work -h2 @ file:///Users/builder/ci_310/h2_1642510373093/work -hpack==4.0.0 -httpcore @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c54vcq42mp/croots/recipe/httpcore_1659344175052/work -httpx @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_19ydeiz3u_/croots/recipe/httpx_1659460918370/work -huggingface-hub @ file:///home/conda/feedstock_root/build_artifacts/huggingface_hub_1679329705182/work -hyperframe==6.0.1 -icetk==0.0.7 -idna @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_00jf0h4zbt/croot/idna_1666125573348/work -importlib-metadata @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_81_20mq0d8/croot/importlib-metadata_1678997090664/work -iniconfig @ file:///home/linux1/recipes/ci/iniconfig_1610983019677/work -ipykernel @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_4dybncc18w/croot/ipykernel_1671488388285/work -ipython @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_5d1j6t_z43/croot/ipython_1676584167910/work -ipython-genutils @ file:///tmp/build/80754af9/ipython_genutils_1606773439826/work -ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1671720089366/work -jedi @ file:///opt/concourse/worker/volumes/live/18b71546-5bde-4add-72d1-7d16b76f0f7a/volume/jedi_1644315243726/work -Jinja2 @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_6adj7x0ejx/croot/jinja2_1666908137966/work -joblib @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_955tow_ysb/croot/joblib_1666298851241/work -json5 @ file:///tmp/build/80754af9/json5_1624432770122/work -jsonschema @ 
file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_21cqeq1xnk/croot/jsonschema_1676558686956/work -jupyter-server @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_031akrjssy/croot/jupyter_server_1671707631142/work -jupyter_client @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_5e4bbpqn9e/croot/jupyter_client_1680171866753/work -jupyter_core @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b82fz_h369/croot/jupyter_core_1679906581737/work -jupyterlab @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a52a346vyc/croot/jupyterlab_1675354129045/work -jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work -jupyterlab-widgets @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_9btffb27id/croot/jupyterlab_widgets_1679055288818/work -jupyterlab_server @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_d7j_et_d0u/croot/jupyterlab_server_1679906304141/work -kiwisolver @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_e26jwrjf6j/croot/kiwisolver_1672387151391/work -langchain==0.0.129 -linkify-it-py @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_2aumoff133/croots/recipe/linkify-it-py_1659783367051/work -llama-index==0.5.4 -lxml @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_cccmd60j4n/croot/lxml_1679646460589/work -markdown-it-py @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_96i9k07xnz/croots/recipe/markdown-it-py_1659718717910/work -MarkupSafe @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_d4a9444f-bd4c-4043-b47d-cede33979b0fve7bm42r/croots/recipe/markupsafe_1654597878200/work -marshmallow==3.19.0 -marshmallow-enum==1.5.1 -matplotlib @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_41fhwn4tj9/croot/matplotlib-suite_1679593479845/work -matplotlib-inline @ 
file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_9ddl71oqte/croots/recipe/matplotlib-inline_1662014471815/work -mdit-py-plugins @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_58unnssimt/croots/recipe/mdit-py-plugins_1659721250303/work -mdurl @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_4dai2ev8x3/croots/recipe/mdurl_1659716031002/work -mistune @ file:///Users/builder/ci_310/mistune_1642534169737/work -multidict @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_81upqwhecp/croot/multidict_1665674236996/work -munkres==1.1.4 -mypy-extensions==1.0.0 -nbclassic @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_9cfvk28pc0/croot/nbclassic_1680008782896/work -nbclient @ file:///opt/concourse/worker/volumes/live/7d38d6af-a5d4-4a2f-68ef-fc787e52a70c/volume/nbclient_1650308404062/work -nbconvert @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_8fyzuglni_/croot/nbconvert_1668450649428/work -nbformat @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_2daun1fill/croot/nbformat_1670352339504/work -nest-asyncio @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_64pfm74mxq/croot/nest-asyncio_1672387129786/work -notebook @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b6ev7dv2qh/croot/notebook_1680012009653/work -notebook_shim @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_e9s6zsmlb7/croot/notebook-shim_1668160584892/work -numexpr @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_cef3ah6r8w/croot/numexpr_1668713880672/work -numpy @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_8cgikbzvtd/croot/numpy_and_numpy_base_1672336184666/work -openai @ file:///home/conda/feedstock_root/build_artifacts/openai_1678675464569/work -openpyxl==3.0.10 -orjson @ 
file:///var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b9drze_kg4/croot/orjson_1668611820381/work/target/wheels/orjson-3.7.8-cp310-cp310-macosx_10_9_x86_64.whl -packaging @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c3zlhpboab/croot/packaging_1678965318579/work -pandas==1.5.3 -pandas-stubs @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_9bmckx_i9z/croot/pandas-stubs_1676319803117/work -pandocfilters @ file:///opt/conda/conda-bld/pandocfilters_1643405455980/work -parso @ file:///opt/conda/conda-bld/parso_1641458642106/work -pathtools @ file:///Users/ktietz/demo/mc3/conda-bld/pathtools_1629713893697/work -pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work -pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work -Pillow==9.4.0 -platformdirs @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_7fs8_2xgrm/croots/recipe/platformdirs_1662711383474/work -plotly @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b7601a7c-071f-4424-ad46-f00f32c1766ccwgx0ywh/croots/recipe/plotly_1658160061089/work -pluggy @ file:///opt/concourse/worker/volumes/live/8277900c-164a-49c8-6f2a-f55c3c0154be/volume/pluggy_1648042581708/work -pooch @ file:///tmp/build/80754af9/pooch_1623324770023/work -prometheus-client @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_19kjbndib7/croots/recipe/prometheus_client_1659455105394/work -prompt-toolkit @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_82emz7mook/croot/prompt-toolkit_1672387300396/work -protobuf==3.18.3 -psutil @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c9b604bf-685f-47f6-8304-238e4e70557e1o7mmsot/croots/recipe/psutil_1656431274701/work -ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl -pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work -py @ 
file:///opt/conda/conda-bld/py_1644396412707/work -pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work -pydantic @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_90fvnva4_f/croot/pydantic_1667416137634/work -pydub @ file:///home/conda/feedstock_root/build_artifacts/pydub_1615612442567/work -Pygments @ file:///opt/conda/conda-bld/pygments_1644249106324/work -pyOpenSSL @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_6dweji2whw/croot/pyopenssl_1677607689781/work -pyparsing @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_3a17y2delq/croots/recipe/pyparsing_1661452538853/work -pyrsistent @ file:///Users/builder/ci_310/pyrsistent_1642541562041/work -PySocks @ file:///Users/builder/ci_310/pysocks_1642536366386/work -pytest==7.1.2 -python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work -python-multipart @ file:///home/conda/feedstock_root/build_artifacts/python-multipart_1679167423335/work -pytz @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_ddzpsmm2_f/croot/pytz_1671697430473/work -PyYAML @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_79xo15pf1i/croot/pyyaml_1670514753622/work -pyzmq @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_15f7a459-ad98-422b-b8da-cbf1f626e2115nt0ocwy/croots/recipe/pyzmq_1657724193704/work -regex @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_978efa21-43f5-4f43-9f25-acd8393817ddce2xcjaf/croots/recipe/regex_1658257186496/work -requests @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b9lic17gbj/croot/requests_1678709739389/work -rfc3986 @ file:///Users/ktietz/demo/mc3/conda-bld/rfc3986_1629478296451/work -scikit-learn @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a65wfgtmmc/croot/scikit-learn_1676911655056/work -scikit-learn-intelex==20230131.200242 -scipy==1.10.0 -semantic-version @ 
file:///tmp/build/80754af9/semantic_version_1613321057691/work -Send2Trash @ file:///tmp/build/80754af9/send2trash_1632406701022/work -sentencepiece==0.1.95 -sentry-sdk @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_7eq20obptv/croots/recipe/sentry-sdk_1659784546221/work -setproctitle @ file:///Users/builder/ci_310/setproctitle_1642537446063/work -six @ file:///tmp/build/80754af9/six_1644875935023/work -smart-open @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_27148096-9ddc-448c-830e-fb4829d46f5dwl2am402/croots/recipe/smart_open_1651563554983/work -smmap @ file:///tmp/build/80754af9/smmap_1611694433573/work -sniffio @ file:///Users/builder/ci_310/sniffio_1642537651147/work -soupsieve @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_14fb2zs6e3/croot/soupsieve_1666296397588/work -SQLAlchemy==1.4.47 -stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work -starlette @ file:///home/conda/feedstock_root/build_artifacts/starlette-recipe_1678817698143/work -tenacity==8.2.2 -terminado @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_18_p3gbeio/croot/terminado_1671751835656/work -threadpoolctl @ file:///Users/ktietz/demo/mc3/conda-bld/threadpoolctl_1629802263681/work -tiktoken @ file:///Users/runner/miniforge3/conda-bld/tiktoken_1679079880442/work -tinycss2 @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_56dshjmms6/croot/tinycss2_1668168824483/work -tokenizers==0.13.2 -tomli @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_90762ba4-f339-47e8-bd29-416854a59b233d27hku_/croots/recipe/tomli_1657175507767/work -toolz @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a7gkswah88/croot/toolz_1667464082910/work -torch==1.12.1 -torchvision @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b64c8kdd72/croot/torchvision_1670313553427/work -tornado @ 
file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_1fimz6o0gc/croots/recipe/tornado_1662061695695/work -tqdm @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_03acd6t6ca/croot/tqdm_1679561866522/work -traitlets @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_0dtilxc0bw/croot/traitlets_1671143889152/work -transformers==4.27.4 -types-pytz @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_37mesw_xfu/croot/types-pytz_1665514246317/work -typing-inspect==0.8.0 -typing_extensions @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_4b7xacf029/croot/typing_extensions_1669923792404/work -uc-micro-py @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a756qk1wsv/croots/recipe/uc-micro-py_1659769413242/work -urllib3 @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_61c6ppfd7m/croot/urllib3_1680254700568/work -uvicorn @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_d80hhx7s7h/croot/uvicorn-split_1678090080474/work -wandb @ file:///home/conda/feedstock_root/build_artifacts/wandb_1678843302744/work -wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work -webencodings==0.5.1 -websocket-client @ file:///Users/builder/ci_310/websocket-client_1642513572726/work -websockets @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c5bbxxm8rk/croot/websockets_1678966795781/work -widgetsnbextension @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_8feupjw2ld/croot/widgetsnbextension_1679313870305/work -yarl @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_d8a27nidjc/croots/recipe/yarl_1661437080982/work -zipp @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b71z79bye2/croot/zipp_1672387125902/work +backoff==2.2.1 +ipywidgets==8.1.1 +jupyterlab==4.0.9 +langchain==0.0.352 +llama-index==0.9.21 +matplotlib==3.8.2 +openai==1.6.1 +pandas==2.1.4 
+pyarrow==14.0.2 +pytest==7.4.3 +scikit-learn==1.3.2 \ No newline at end of file