From 1c827eba4866df9a4151797cf087bd19b0d70127 Mon Sep 17 00:00:00 2001
From: johnny9210
Date: Tue, 6 May 2025 15:21:37 +0900
Subject: [PATCH 1/3] [N-2] 11-Vector Store / 11 - Faiss

---
 09-VectorStore/11-Faiss.ipynb | 818 ++++++++++++++++++++++++++++++++++
 09-VectorStore/utils/faiss.py | 260 +++++++++++
 2 files changed, 1078 insertions(+)
 create mode 100644 09-VectorStore/11-Faiss.ipynb
 create mode 100644 09-VectorStore/utils/faiss.py

diff --git a/09-VectorStore/11-Faiss.ipynb b/09-VectorStore/11-Faiss.ipynb
new file mode 100644
index 000000000..422dfdb63
--- /dev/null
+++ b/09-VectorStore/11-Faiss.ipynb
@@ -0,0 +1,818 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "25733da0",
+   "metadata": {},
+   "source": [
+    "# Faiss\n",
+    "\n",
+    "- Author: [Ilgyun Jeong](https://github.com/johnny9210)\n",
+    "- Design: \n",
+    "- Peer Review: \n",
+    "- This is a part of [LangChain Open Tutorial](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial)\n",
+    "\n",
+    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/99-TEMPLATE/00-BASE-TEMPLATE-EXAMPLE.ipynb) [![Open in GitHub](https://img.shields.io/badge/Open%20in%20GitHub-181717?style=flat-square&logo=github&logoColor=white)](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/99-TEMPLATE/00-BASE-TEMPLATE-EXAMPLE.ipynb)\n",
+    "\n",
+    "## Overview\n",
+    "\n",
+    "This tutorial covers how to use ```Faiss``` with **LangChain** .\n",
+    "\n",
+    "```Faiss``` (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also includes supporting code for evaluation and parameter tuning.\n",
+    "\n",
+    "This tutorial walks you through performing **CRUD** operations with ```Faiss``` : **storing** , **updating** , and **deleting** documents, and performing **similarity-based retrieval** .\n",
+    "\n",
+    "### Table of Contents\n",
+    "\n",
+    "- [Overview](#overview)\n",
+    "- [Environment Setup](#environment-setup)\n",
+    "- [What is Faiss?](#what-is-faiss)\n",
+    "- [Data](#data)\n",
+    "- [Initial Setting Faiss](#initial-setting-faiss)\n",
+    "- [Document Manager](#document-manager)\n",
+    "\n",
+    "\n",
+    "### References\n",
+    "\n",
+    "- [Faiss](https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/)\n",
+    "- [Faiss library paper](https://arxiv.org/pdf/2401.08281)\n",
+    "- [Faiss documentation](https://faiss.ai/)\n",
+    "- [LangChain Faiss documentation](https://python.langchain.com/docs/integrations/vectorstores/faiss/)\n",
+    "----"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c1fac085",
+   "metadata": {},
+   "source": [
+    "## Environment Setup\n",
+    "\n",
+    "Set up the environment. You may refer to [Environment Setup](https://wikidocs.net/257836) for more details.\n",
+    "\n",
+    "**[Note]**\n",
+    "- ```langchain-opentutorial``` is a package that provides a set of easy-to-use environment setup, useful functions and utilities for tutorials. \n",
+    "- You can check out the [```langchain-opentutorial```](https://github.com/LangChain-OpenTutorial/langchain-opentutorial-pypi) repository for more details."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "98da7994",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture --no-stderr\n",
+    "%pip install langchain-opentutorial"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "69133474",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Install necessary packages\n",
+    "%pip install -qU langchain-community faiss-cpu\n",
+    "\n",
+    "# Note that you can also install faiss-gpu if you want to use the GPU-enabled version\n",
+    "# %pip install -qU langchain-community faiss-gpu"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "800c732b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Install required packages\n",
+    "from langchain_opentutorial import package\n",
+    "\n",
+    "package.install(\n",
+    "    [\n",
+    "        \"langsmith\",\n",
+    "        \"langchain\",\n",
+    "        \"langchain_core\",\n",
+    "        \"langchain_community\",\n",
+    "        \"faiss-cpu\",\n",
+    "    ],\n",
+    "    verbose=False,\n",
+    "    upgrade=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5b36bafa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Environment variables have been set successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Set environment variables\n",
+    "from langchain_opentutorial import set_env\n",
+    "\n",
+    "set_env(\n",
+    "    {\n",
+    "        \"OPENAI_API_KEY\": \"Your OPENAI API KEY\",\n",
+    "        \"LANGCHAIN_API_KEY\": \"Your LangChain API KEY\",\n",
+    "        \"LANGCHAIN_TRACING_V2\": \"true\",\n",
+    "        \"LANGCHAIN_ENDPOINT\": \"https://api.smith.langchain.com\",\n",
+    "        \"LANGCHAIN_PROJECT\": \"Faiss\",\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8011a0c7",
+   "metadata": {},
+   "source": [
+    "You can alternatively set API keys such as ```OPENAI_API_KEY``` in a ```.env``` file and load them.\n",
+    "\n",
+    "[Note] This is not necessary if you've already set the required API keys in previous steps.\n",
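+    "\n",
+    "As a minimal sketch (every value below is a placeholder, not a real key), such a ```.env``` file in the project root might look like this:\n",
+    "\n",
+    "```text\n",
+    "OPENAI_API_KEY=sk-...\n",
+    "LANGCHAIN_API_KEY=lsv2_...\n",
+    "LANGCHAIN_TRACING_V2=true\n",
+    "LANGCHAIN_ENDPOINT=https://api.smith.langchain.com\n",
+    "LANGCHAIN_PROJECT=Faiss\n",
+    "```"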
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "70d7e764",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "load_dotenv(override=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "915cf401",
+   "metadata": {},
+   "source": [
+    "## What is Faiss?\n",
+    "\n",
+    "```Faiss``` (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of dense vectors.\n",
+    "\n",
+    "* Core Concepts\n",
+    "  * ```Similarity search```: Finding vectors that are closest to a query vector\n",
+    "  * ```Scaling```: Handles vector sets of any size, including those exceeding RAM\n",
+    "  * ```Efficiency```: Optimized for memory usage and search speed\n",
+    "\n",
+    "* Vector Operations\n",
+    "  * ```Nearest neighbor```: Finding k vectors closest to a query vector\n",
+    "  * ```Maximum inner product```: Finding vectors with highest dot product\n",
+    "  * ```Clustering```: Grouping similar vectors together\n",
+    "\n",
+    "* Index Types\n",
+    "  * ```Flat```: Exact search with exhaustive comparison\n",
+    "  * ```IVF```: Inverted file structure for faster approximate search\n",
+    "  * ```HNSW```: Hierarchical navigable small world graphs for high-quality search\n",
+    "  * ```PQ```: Product quantization for memory compression\n",
+    "  * ```OPQ```: Optimized product quantization for better accuracy\n",
+    "\n",
+    "* Performance Metrics\n",
+    "  * ```Speed```: Query time for finding similar vectors\n",
+    "  * ```Memory```: RAM requirements for index storage\n",
+    "  * ```Accuracy```: How well results match exhaustive search (recall)\n",
+    "\n",
+    "* Technical Features\n",
+    "  * ```GPU support```: State-of-the-art GPU implementations with 5-20x speedup\n",
+    "  * ```Multi-threading```: Parallel processing across CPU cores\n",
+    "  * ```SIMD optimization```: Vectorized operations for faster computation\n",
+    "  * ```Half-precision```: Float16 support for better performance\n",
+    "\n",
+    "* Applications\n",
+    "  * ```Image similarity```: Finding visually similar images\n",
+    "  * ```Text embeddings```: Semantic search in document collections\n",
+    "  * ```Recommendation systems```: Finding similar items for users\n",
+    "  * ```Classification```: Computing maximum inner-products for classification"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f3b5bd2",
+   "metadata": {},
+   "source": [
+    "## Data\n",
+    "\n",
+    "This part walks you through the **data preparation process** .\n",
+    "\n",
+    "This section includes the following components:\n",
+    "\n",
+    "- Introduce Data\n",
+    "\n",
+    "- Preprocessing Data\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "508ae7f7",
+   "metadata": {},
+   "source": [
+    "### Introduce Data\n",
+    "\n",
+    "In this tutorial, we will use the fairy tale **📗 The Little Prince** as our data, in a plain-text (.txt) format converted from the original PDF.\n",
+    "\n",
+    "This material complies with the **Apache 2.0 license** .\n",
+    "\n",
+    "You can view the data at the link below.\n",
+    "- [Data Link](https://huggingface.co/datasets/sohyunwriter/the_little_prince)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "004ea4f4",
+   "metadata": {},
+   "source": [
+    "### Preprocessing Data\n",
+    "\n",
+    "In this tutorial section, we will preprocess the text data from The Little Prince and convert it into a list of ```LangChain Document``` objects with metadata. \n",
\n", + "\n", + "Each document chunk will include a ```title``` field in the metadata, extracted from the first line of each section." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8e4cac64", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema import Document\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "import re\n", + "from typing import List\n", + "\n", + "\n", + "def preprocessing_data(content: str) -> List[Document]:\n", + " # 1. Split the text by double newlines to separate sections\n", + " blocks = content.split(\"\\n\\n\")\n", + "\n", + " # 2. Initialize the text splitter\n", + " text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=500, # Maximum number of characters per chunk\n", + " chunk_overlap=50, # Overlap between chunks to preserve context\n", + " separators=[\"\\n\\n\", \"\\n\", \" \"], # Order of priority for splitting\n", + " )\n", + "\n", + " documents = []\n", + "\n", + " # 3. Loop through each section\n", + " for block in blocks:\n", + " lines = block.strip().splitlines()\n", + " if not lines:\n", + " continue\n", + "\n", + " # Extract title from the first line using square brackets [ ]\n", + " first_line = lines[0]\n", + " title_match = re.search(r\"\\[(.*?)\\]\", first_line)\n", + " title = title_match.group(1).strip() if title_match else None\n", + "\n", + " # Remove the title line from content\n", + " body = \"\\n\".join(lines[1:]).strip()\n", + " if not body:\n", + " continue\n", + "\n", + " # 4. Chunk the section using the text splitter\n", + " chunks = text_splitter.split_text(body)\n", + "\n", + " # 5. Create a LangChain Document for each chunk with the same title metadata\n", + " for chunk in chunks:\n", + " documents.append(Document(page_content=chunk, metadata={\"title\": title}))\n", + "\n", + " print(f\"Generated {len(documents)} chunked documents.\")\n", + "\n", + " return documents" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1d091a51", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated 262 chunked documents.\n" + ] + } + ], + "source": [ + "# Load the entire text file\n", + "with open(\"./data/the_little_prince.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " content = f.read()\n", + "\n", + "# Preprocessing Data\n", + "\n", + "docs = preprocessing_data(content=content)" + ] + }, + { + "cell_type": "markdown", + "id": "1977d4ff", + "metadata": {}, + "source": [ + "## Initial Setting Faiss\n", + "\n", + "This part walks you through the initial setup of ```Faiss``` .\n", + "\n", + "This section includes the following components:\n", + "\n", + "- Load Embedding Model\n", + "\n", + "- Load ```Faiss``` Client" + ] + }, + { + "cell_type": "markdown", + "id": "7eee56b2", + "metadata": {}, + "source": [ + "### Load Embedding Model\n", + "\n", + "In the **Load Embedding Model** section, you'll learn how to load an embedding model.\n", + "\n", + "This tutorial uses **OpenAI's** **API-Key** for loading the model.\n", + "\n", + "*๐Ÿ’ก If you prefer to use another embedding model, see the instructions below.*\n", + "- [Embedding Models](https://python.langchain.com/docs/integrations/text_embedding/)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5bd5c3c9", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embedding = OpenAIEmbeddings(model=\"text-embedding-3-large\")" + ] + }, + { + 
"cell_type": "markdown", + "id": "40f65795", + "metadata": {}, + "source": [ + "### Load Faiss Client\n", + "\n", + "In the **Load ```Faiss``` Client** section, we cover how to load the **database client object** using the **Python SDK** for ```Faiss``` .\n", + "- [Faiss Python SDK Docs](https://github.com/facebookresearch/faiss/wiki/getting-started?utm_source=chatgpt.com)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "eed0ebad", + "metadata": {}, + "outputs": [], + "source": [ + "# Create Database Client Object Function\n", + "import faiss\n", + "import numpy as np\n", + "\n", + "\n", + "def get_db_client(dim: int = 128):\n", + " \"\"\"\n", + "\n", + " Initializes and returns a VectorStore client instance.\n", + "\n", + "\n", + " This function loads configuration (e.g., API key, host) from environment\n", + "\n", + " variables or default values and creates a client object to interact\n", + "\n", + " with the faiss Python SDK.\n", + "\n", + "\n", + " Returns:\n", + "\n", + " client:ClientType - An instance of the faiss client.\n", + "\n", + "\n", + " Raises:\n", + "\n", + " ValueError: If required configuration is missing.\n", + "\n", + " \"\"\"\n", + "\n", + " base_index = faiss.IndexFlatL2(dim) # L2 ๊ฑฐ๋ฆฌ ๊ธฐ๋ฐ˜ ์ธ๋ฑ์Šค ์ƒ์„ฑ\n", + " client = faiss.IndexIDMap(base_index) # ID ๋งคํ•‘ ์ง€์› ์ถ”๊ฐ€\n", + "\n", + " return client" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2b5f4116", + "metadata": {}, + "outputs": [], + "source": [ + "# Get DB Client Object\n", + "client = get_db_client()" + ] + }, + { + "cell_type": "markdown", + "id": "3a5a97a0", + "metadata": {}, + "source": [ + "## Document Manager\n", + "\n", + "To support the **Langchain-Opentutorial** , we implemented a custom set of **CRUD** functionalities for VectorDBs. \n", + "\n", + "The following operations are included:\n", + "\n", + "- ```upsert``` : Update existing documents or insert if they donโ€™t exist\n", + "\n", + "- ```upsert_parallel``` : Perform upserts in parallel for large-scale data\n", + "\n", + "- ```similarity_search``` : Search for similar documents based on embeddings\n", + "\n", + "- ```delete``` : Remove documents based on filter conditions\n", + "\n", + "Each of these features is implemented as class methods specific to each VectorDB.\n", + "\n", + "In this tutorial, you can easily utilize these methods to interact with your VectorDB.\n", + "\n", + "*We plan to continuously expand the functionality by adding more common operations in the future.*" + ] + }, + { + "cell_type": "markdown", + "id": "65a40601", + "metadata": {}, + "source": [ + "### Create Instance\n", + "\n", + "First, we create an instance of the ```faiss``` helper class to use its CRUD functionalities.\n", + "\n", + "This class is initialized with the **```faiss``` Python SDK client instance**, **index name** and the **embedding model instance** , both of which were defined in the previous section." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dccab807", + "metadata": {}, + "outputs": [], + "source": [ + "# import FaissCRUDManager\n", + "from utils.faiss import FaissCRUDManager\n", + "\n", + "# connect to tutorial_index\n", + "crud_manager = FaissCRUDManager(dim=3072, embedding=embedding)" + ] + }, + { + "cell_type": "markdown", + "id": "5859c412", + "metadata": {}, + "source": [ + "Now you can use the following **CRUD** operations with the ```crud_manager``` instance.\n", + "\n", + "These instance allow you to easily manage documents in your ```faiss``` ." 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7c6c53c5",
+   "metadata": {},
+   "source": [
+    "### Upsert Document\n",
+    "\n",
+    "**Update** existing documents or **insert** if they don't exist\n",
+    "\n",
+    "**✅ Args**\n",
+    "\n",
+    "- ```texts``` : Iterable[str] – List of text contents to be inserted/updated.\n",
+    "\n",
+    "- ```metadatas``` : Optional[List[Dict]] – List of metadata dictionaries for each text (optional).\n",
+    "\n",
+    "- ```ids``` : Optional[List[str]] – Custom IDs for the documents. If not provided, IDs will be auto-generated.\n",
+    "\n",
+    "- ```**kwargs``` : Extra arguments for the underlying vector store.\n",
+    "\n",
+    "**🔄 Return**\n",
+    "\n",
+    "- None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "f3a6c32b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from uuid import uuid4\n",
+    "\n",
+    "# Create an ID for each document\n",
+    "ids = [str(uuid4()) for _ in docs]\n",
+    "\n",
+    "args = {\n",
+    "    \"texts\": [doc.page_content for doc in docs[:2]],\n",
+    "    \"metadatas\": [doc.metadata for doc in docs[:2]],\n",
+    "    \"ids\": ids[:2],\n",
+    "    # add extra parameters here if needed\n",
+    "}\n",
+    "\n",
+    "crud_manager.upsert(**args)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "278fe1ed",
+   "metadata": {},
+   "source": [
+    "### Upsert Parallel Document\n",
+    "\n",
+    "Perform **upserts** in **parallel** for large-scale data\n",
+    "\n",
+    "**✅ Args**\n",
+    "\n",
+    "- ```texts``` : Iterable[str] – List of text contents to be inserted/updated.\n",
+    "\n",
+    "- ```metadatas``` : Optional[List[Dict]] – List of metadata dictionaries for each text (optional).\n",
+    "\n",
+    "- ```ids``` : Optional[List[str]] – Custom IDs for the documents. If not provided, IDs will be auto-generated.\n",
+    "\n",
+    "- ```batch_size``` : int – Number of documents per batch (default: 32).\n",
+    "\n",
+    "- ```workers``` : int – Number of parallel workers (default: 10).\n",
+    "\n",
+    "- ```**kwargs``` : Extra arguments for the underlying vector store.\n",
+    "\n",
+    "**🔄 Return**\n",
+    "\n",
+    "- None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "a89dd8e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from uuid import uuid4\n",
+    "\n",
+    "args = {\n",
+    "    \"texts\": [doc.page_content for doc in docs],\n",
+    "    \"metadatas\": [doc.metadata for doc in docs],\n",
+    "    \"ids\": ids,\n",
+    "    # add extra parameters here if needed\n",
+    "}\n",
+    "\n",
+    "crud_manager.upsert_parallel(**args)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6beea197",
+   "metadata": {},
+   "source": [
+    "### Similarity Search\n",
+    "\n",
+    "Search for **similar documents** based on **embeddings** .\n",
+    "\n",
+    "Internally, this implementation measures **L2 (Euclidean) distance** between embeddings and converts each distance to a similarity score via ```1 / (1 + distance)``` .\n",
+    "\n",
+    "\n",
+    "**✅ Args**\n",
+    "\n",
+    "- ```query``` : str – The text query for similarity search.\n",
+    "\n",
+    "- ```k``` : int – Number of top results to return (default: 10).\n",
+    "\n",
+    "- ```**kwargs``` : Additional search options (e.g., filters).\n",
+    "\n",
+    "**🔄 Return**\n",
+    "\n",
+    "- ```results``` : List[Dict] – A list of result dictionaries (with ```text``` , ```metadata``` , and ```score``` keys) ranked by similarity."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "5859782b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Rank 1\n",
+      "Contents : And he went back to meet the fox. \n",
+      "\"Goodbye,\" he said. \n",
\"And now here is my secret, a very simple secret: It is only with the heart that one can see rightly; what is essential is invisible to the eye.\" \n", + "\"What is essential is invisible to the eye,\" the little prince repeated, so that he would be sure to remember.\n", + "\"It is the time you have wasted for your rose that makes your rose so important.\"\n", + "Metadata: {'id': 'f3377018-0d2c-45b0-baec-75eb071ef5f5', 'title': 'Chapter 21'}\n", + "Similarity Score: 0.504\n", + "\n", + "Rank 2\n", + "Contents : \"Yes,\" I said to the little prince. \"The house, the stars, the desert-- what gives them their beauty is something that is invisible!\" \n", + "\"I am glad,\" he said, \"that you agree with my fox.\"\n", + "Metadata: {'id': '88bc73d6-a7f5-424c-9a6d-6aa496b5254a', 'title': 'Chapter 24'}\n", + "Similarity Score: 0.498\n", + "\n", + "Rank 3\n", + "Contents : \"The men where you live,\" said the little prince, \"raise five thousand roses in the same garden-- and they do not find in it what they are looking for.\" \n", + "\"They do not find it,\" I replied. \n", + "\"And yet what they are looking for could be found in one single rose, or in a little water.\" \n", + "\"Yes, that is true,\" I said. \n", + "And the little prince added: \n", + "\"But the eyes are blind. One must look with the heart...\"\n", + "Metadata: {'id': 'e616d38d-b757-487d-9a5b-b164a69efb15', 'title': 'Chapter 25'}\n", + "Similarity Score: 0.464\n", + "\n" + ] + } + ], + "source": [ + "# Search by Query\n", + "\n", + "results = crud_manager.search(query=\"What is essential is invisible to the eye.\", k=3)\n", + "for idx, result in enumerate(results):\n", + " print(f\"Rank {idx+1}\")\n", + " print(f\"Contents : {result['text']}\")\n", + " print(f\"Metadata: {result['metadata']}\")\n", + " print(f\"Similarity Score: {result['score']}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "id": "9ad0ed0c", + "metadata": {}, + "source": [ + "### Delete Document\n", + "\n", + "Remove documents based on filter conditions\n", + "\n", + "**โœ… Args**\n", + "\n", + "- ```ids``` : Optional[List[str]] โ€“ List of document IDs to delete. 
If None, deletion is based on filter.\n",
+    "\n",
+    "- ```filters``` : Optional[Dict] – Dictionary specifying filter conditions (e.g., metadata match).\n",
+    "\n",
+    "- ```**kwargs``` : Any additional parameters.\n",
+    "\n",
+    "**🔄 Return**\n",
+    "\n",
+    "- Boolean"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "0e3a2c33",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Delete by ids\n",
+    "\n",
+    "ids = ids[:5]  # The 'ids' you want to delete\n",
+    "crud_manager.delete(ids=ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "60bcb4cf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Delete by filters\n",
+    "\n",
+    "filters = {\"title\": \"Chapter 6\"}\n",
+    "crud_manager.delete(filters=filters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "30d42d2e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Delete All\n",
+    "\n",
+    "crud_manager.delete()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4663706b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/09-VectorStore/utils/faiss.py b/09-VectorStore/utils/faiss.py
new file mode 100644
index 000000000..5efd50fbe
--- /dev/null
+++ b/09-VectorStore/utils/faiss.py
@@ -0,0 +1,260 @@
+from utils.vectordbinterface import DocumentManager
+from utils.vectordbinterface import Iterable, Any, Optional, List, Dict
+from langchain_core.documents import Document
+import faiss
+import numpy as np
+from uuid import uuid4
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional, Any, Iterable
+
+
+class FaissCRUDManager(DocumentManager):
+    def __init__(
+        self, dim: int = 768, embedding: Optional[Any] = None, **kwargs
+    ) -> None:
+        """
+        Initialize the FAISS vector database manager.
+
+        Args:
+            dim: Dimension of the embedding vectors
+            embedding: Optional embedding function
+            **kwargs: Additional arguments
+        """
+        super().__init__()
+        self.dim = dim
+        self.embedding = embedding
+
+        # Initialize the FAISS index - wrap with IndexIDMap for ID support
+        base_index = faiss.IndexFlatL2(dim)  # Create an index based on L2 distance
+        self.index = faiss.IndexIDMap(base_index)  # Add ID mapping support
+
+        # Dictionary for storing documents and metadata by ID
+        self.document_store = {}
+        self.next_id = 0  # Internal ID counter
+
+    def upsert(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[Dict]] = None,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Embed texts and add them to the FAISS index.
+
+        Args:
+            texts: Documents or text strings
+            metadatas: Metadata dictionaries
+            ids: Unique IDs; auto-generated if None
+            **kwargs: Additional parameters
๋งค๊ฐœ๋ณ€์ˆ˜ + """ + texts_list = list(texts) + + if ids is None: + ids = [str(uuid4()) for _ in range(len(texts_list))] + + if metadatas is None: + metadatas = [{} for _ in range(len(texts_list))] + + # ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ + if self.embedding: + embeddings = self.embedding.embed_documents(texts_list) + else: + # ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ (์˜ˆ์‹œ๋กœ ๋žœ๋ค ๋ฒกํ„ฐ ์ƒ์„ฑ) + embeddings = [np.random.rand(self.dim).astype('float32') for _ in texts_list] + + # FAISS ์ธ๋ฑ์Šค์— ๋ฒกํ„ฐ์™€ ID ์ถ”๊ฐ€ + vectors = np.array(embeddings).astype('float32') + + # ID๋ฅผ ์ •์ˆ˜๋กœ ๋ณ€ํ™˜ (FAISS๋Š” ์ •์ˆ˜ ID๋งŒ ์ง€์›) + int_ids = np.array([i + self.next_id for i in range(len(texts_list))], dtype=np.int64) + + # ๋ฒกํ„ฐ์™€ ID ํ•จ๊ป˜ ์ถ”๊ฐ€ + self.index.add_with_ids(vectors, int_ids) + + # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์™€ ๋ฌธ์„œ ์ €์žฅ + for i, (text, metadata, user_id) in enumerate(zip(texts_list, metadatas, ids)): + # ํ˜„์žฌ ์ธ๋ฑ์Šค๋Š” self.next_id + i + idx = self.next_id + i + + self.document_store[user_id] = { + 'index': idx, # FAISS ์ธ๋ฑ์Šค ์ €์žฅ + 'text': text, + 'metadata': metadata + } + + self.next_id += len(texts_list) + + def upsert_parallel( + self, + texts: Iterable[str], + metadatas: Optional[List[Dict]] = None, + ids: Optional[List[str]] = None, + batch_size: int = 32, + workers: int = 10, + **kwargs: Any, + ) -> None: + """ + ํ…์ŠคํŠธ๋ฅผ ๋ณ‘๋ ฌ๋กœ ์ฒ˜๋ฆฌํ•˜์—ฌ ์ž„๋ฒ ๋”ฉํ•˜๊ณ  FAISS ์ธ๋ฑ์Šค์— ์ถ”๊ฐ€ + + Args: + texts: ๋ฌธ์„œ ๋˜๋Š” ํ…์ŠคํŠธ + metadatas: ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ + ids: ๊ณ ์œ  ID, None์ด๋ฉด ์ž๋™ ์ƒ์„ฑ + batch_size: ๋ฐฐ์น˜ ํฌ๊ธฐ + workers: ์ž‘์—…์ž ์ˆ˜ + **kwargs: ์ถ”๊ฐ€ ๋งค๊ฐœ๋ณ€์ˆ˜ + """ + # ๋ฐฐ์น˜ ์ƒ์„ฑ + texts_list = list(texts) + total = len(texts_list) + + if ids is None: + ids = [str(uuid4()) for _ in range(total)] + + if metadatas is None: + metadatas = [{} for _ in range(total)] + + batches = [ + ( + texts_list[i : i + batch_size], + metadatas[i : i + batch_size] if metadatas else None, + ids[i : i + batch_size] if ids else None, + ) + for i in range(0, total, batch_size) + ] + + # ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ + with ThreadPoolExecutor(max_workers=workers) as executor: + list(executor.map(lambda batch: self.upsert(*batch, **kwargs), batches)) + + def search(self, query: str, k: int = 10, **kwargs: Any) -> List[Dict]: + """ + ์ฟผ๋ฆฌ์™€ ๊ฐ€์žฅ ์œ ์‚ฌํ•œ ๋ฌธ์„œ ๊ฒ€์ƒ‰ + + Args: + query: ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ + k: ๋ฐ˜ํ™˜ํ•  ๊ฒฐ๊ณผ ์ˆ˜ + **kwargs: ํ•„ํ„ฐ๋ง ์˜ต์…˜ + + Returns: + ์œ ์‚ฌํ•œ ๋ฌธ์„œ ๋ฆฌ์ŠคํŠธ + """ + if not self.document_store: # ๋ฌธ์„œ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ + return [] + + # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ + if self.embedding: + query_embedding = self.embedding.embed_documents([query])[0] + else: + # ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ (์˜ˆ์‹œ๋กœ ๋žœ๋ค ๋ฒกํ„ฐ ์ƒ์„ฑ) + query_embedding = np.random.rand(self.dim).astype('float32') + + # ๋ฒกํ„ฐ๋ฅผ numpy ๋ฐฐ์—ด๋กœ ๋ณ€ํ™˜ + query_vector = np.array([query_embedding]).astype('float32') + + # ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ - IndexIDMap์˜ ๊ฒฝ์šฐ ๊ฑฐ๋ฆฌ์™€ ํ•จ๊ป˜ ์‹ค์ œ ID ๋ฐ˜ํ™˜ + distances, indices = self.index.search(query_vector, k) + + # ๊ฒฐ๊ณผ ๋ณ€ํ™˜ + results = [] + + # ์—ญ๋ฐฉํ–ฅ ์ธ๋ฑ์Šค ๋งคํ•‘ (๋‚ด๋ถ€ ์ธ๋ฑ์Šค ID -> ์‚ฌ์šฉ์ž ID) + index_to_id = {} + for user_id, doc_info in self.document_store.items(): + index_to_id[doc_info['index']] = user_id + + for i, (distance, idx) in enumerate(zip(distances[0], indices[0])): + if idx == -1: # -1์€ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Œ์„ ์˜๋ฏธ + continue + + # ๋‚ด๋ถ€ ์ธ๋ฑ์Šค๋ฅผ ํ†ตํ•ด ์‚ฌ์šฉ์ž ID ์ฐพ๊ธฐ + if idx not in index_to_id: + continue + + user_id = index_to_id[idx] + doc_info = self.document_store[user_id] + + # 
์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„๋กœ ๋ณ€ํ™˜ (์„ ํƒ์ ) + score = 1.0 / (1.0 + distance) + score = round(score, 3) + + # ๊ฒฐ๊ณผ ๋”•์…”๋„ˆ๋ฆฌ ์ƒ์„ฑ + result = { + 'text': doc_info['text'], + 'metadata': { + 'id': user_id, + **doc_info['metadata'] + }, + 'score': score + } + results.append(result) + + return results + + def delete( + self, + ids: Optional[List[str]] = None, + filters: Optional[Dict] = None, + **kwargs: Any, + ) -> bool: + """ + ์ธ๋ฑ์Šค์—์„œ ๋ฌธ์„œ ์‚ญ์ œ + + Args: + ids: ์‚ญ์ œํ•  ๋ฌธ์„œ์˜ ID ๋ชฉ๋ก + filters: ์‚ญ์ œํ•  ๋ฌธ์„œ๋ฅผ ํ•„ํ„ฐ๋งํ•˜๋Š” ์กฐ๊ฑด + **kwargs: ์ถ”๊ฐ€ ๋งค๊ฐœ๋ณ€์ˆ˜ + + Returns: + ์„ฑ๊ณต ์—ฌ๋ถ€๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ๋ถˆ๋ฆฌ์–ธ ๊ฐ’ + """ + # ํ•„ํ„ฐ ๊ธฐ๋ฐ˜ ์‚ญ์ œ + if filters and not ids: + ids_to_delete = [] + for user_id, doc_info in self.document_store.items(): + match = True + for key, value in filters.items(): + if key not in doc_info['metadata'] or doc_info['metadata'][key] != value: + match = False + break + + if match: + ids_to_delete.append(user_id) + + if ids_to_delete: + return self.delete(ids=ids_to_delete) + return True + + # ID๊ฐ€ ์—†๊ณ  ํ•„ํ„ฐ๋„ ์—†์œผ๋ฉด ๋ชจ๋“  ๋ฌธ์„œ ์‚ญ์ œ + if ids is None and filters is None: + # ๋ชจ๋“  ๋ฌธ์„œ ์‚ญ์ œ - ์ธ๋ฑ์Šค๋ฅผ ์žฌ์„ค์ •ํ•ฉ๋‹ˆ๋‹ค + base_index = faiss.IndexFlatL2(self.dim) + self.index = faiss.IndexIDMap(base_index) + self.document_store = {} + self.next_id = 0 + return True + + # ํŠน์ • ID ์‚ญ์ œ + if ids: + # ์‚ญ์ œํ•  ID๊ฐ€ ์กด์žฌํ•˜๋Š”์ง€ ํ™•์ธ + ids_to_delete = [id for id in ids if id in self.document_store] + + if not ids_to_delete: + return True # ์‚ญ์ œํ•  ๊ฒƒ์ด ์—†์Œ + + # FAISS ๋‚ด๋ถ€ ID ๋ชฉ๋ก ์ถ”์ถœ + faiss_ids = [self.document_store[user_id]['index'] for user_id in ids_to_delete] + + # IndexIDMap์—์„œ ์ œ๊ณตํ•˜๋Š” remove_ids ๋ฉ”์„œ๋“œ ์‚ฌ์šฉ + try: + self.index.remove_ids(np.array(faiss_ids, dtype=np.int64)) + + # ๋ฌธ์„œ ์ €์žฅ์†Œ์—์„œ๋„ ์‚ญ์ œ + for user_id in ids_to_delete: + del self.document_store[user_id] + + return True + except Exception as e: + print(f"FAISS ์‚ญ์ œ ์˜ค๋ฅ˜: {e}") + return False \ No newline at end of file From 032f00e7c8232ce6e0b0fbca8e0e0fcc02fe3303 Mon Sep 17 00:00:00 2001 From: johnny9210 Date: Tue, 6 May 2025 15:27:39 +0900 Subject: [PATCH 2/3] [N-2] 11-Vector Store / 11 - Faiss --- 09-VectorStore/utils/faiss.py | 42 ++++++----------------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/09-VectorStore/utils/faiss.py b/09-VectorStore/utils/faiss.py index 5efd50fbe..c67348a62 100644 --- a/09-VectorStore/utils/faiss.py +++ b/09-VectorStore/utils/faiss.py @@ -24,13 +24,11 @@ def __init__( self.dim = dim self.embedding = embedding - # FAISS ์ธ๋ฑ์Šค ์ดˆ๊ธฐํ™” - IndexIDMap์œผ๋กœ ๊ฐ์‹ธ์„œ ID ์ง€์› ์ถ”๊ฐ€ - base_index = faiss.IndexFlatL2(dim) # L2 ๊ฑฐ๋ฆฌ ๊ธฐ๋ฐ˜ ์ธ๋ฑ์Šค ์ƒ์„ฑ - self.index = faiss.IndexIDMap(base_index) # ID ๋งคํ•‘ ์ง€์› ์ถ”๊ฐ€ + base_index = faiss.IndexFlatL2(dim) + self.index = faiss.IndexIDMap(base_index) - # ID์™€ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ๋ฅผ ์ €์žฅํ•  ๋”•์…”๋„ˆ๋ฆฌ self.document_store = {} - self.next_id = 0 # ๋‚ด๋ถ€ ID ์นด์šดํ„ฐ + self.next_id = 0 def upsert( self, @@ -56,29 +54,22 @@ def upsert( if metadatas is None: metadatas = [{} for _ in range(len(texts_list))] - # ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ if self.embedding: embeddings = self.embedding.embed_documents(texts_list) else: - # ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ (์˜ˆ์‹œ๋กœ ๋žœ๋ค ๋ฒกํ„ฐ ์ƒ์„ฑ) embeddings = [np.random.rand(self.dim).astype('float32') for _ in texts_list] - # FAISS ์ธ๋ฑ์Šค์— ๋ฒกํ„ฐ์™€ ID ์ถ”๊ฐ€ vectors = np.array(embeddings).astype('float32') - # ID๋ฅผ ์ •์ˆ˜๋กœ 
๋ณ€ํ™˜ (FAISS๋Š” ์ •์ˆ˜ ID๋งŒ ์ง€์›) int_ids = np.array([i + self.next_id for i in range(len(texts_list))], dtype=np.int64) - # ๋ฒกํ„ฐ์™€ ID ํ•จ๊ป˜ ์ถ”๊ฐ€ self.index.add_with_ids(vectors, int_ids) - # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์™€ ๋ฌธ์„œ ์ €์žฅ for i, (text, metadata, user_id) in enumerate(zip(texts_list, metadatas, ids)): - # ํ˜„์žฌ ์ธ๋ฑ์Šค๋Š” self.next_id + i idx = self.next_id + i self.document_store[user_id] = { - 'index': idx, # FAISS ์ธ๋ฑ์Šค ์ €์žฅ + 'index': idx, 'text': text, 'metadata': metadata } @@ -105,7 +96,6 @@ def upsert_parallel( workers: ์ž‘์—…์ž ์ˆ˜ **kwargs: ์ถ”๊ฐ€ ๋งค๊ฐœ๋ณ€์ˆ˜ """ - # ๋ฐฐ์น˜ ์ƒ์„ฑ texts_list = list(texts) total = len(texts_list) @@ -124,7 +114,6 @@ def upsert_parallel( for i in range(0, total, batch_size) ] - # ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ with ThreadPoolExecutor(max_workers=workers) as executor: list(executor.map(lambda batch: self.upsert(*batch, **kwargs), batches)) @@ -140,46 +129,37 @@ def search(self, query: str, k: int = 10, **kwargs: Any) -> List[Dict]: Returns: ์œ ์‚ฌํ•œ ๋ฌธ์„œ ๋ฆฌ์ŠคํŠธ """ - if not self.document_store: # ๋ฌธ์„œ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ + if not self.document_store: return [] - # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ if self.embedding: query_embedding = self.embedding.embed_documents([query])[0] else: - # ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ (์˜ˆ์‹œ๋กœ ๋žœ๋ค ๋ฒกํ„ฐ ์ƒ์„ฑ) query_embedding = np.random.rand(self.dim).astype('float32') - # ๋ฒกํ„ฐ๋ฅผ numpy ๋ฐฐ์—ด๋กœ ๋ณ€ํ™˜ query_vector = np.array([query_embedding]).astype('float32') - # ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ - IndexIDMap์˜ ๊ฒฝ์šฐ ๊ฑฐ๋ฆฌ์™€ ํ•จ๊ป˜ ์‹ค์ œ ID ๋ฐ˜ํ™˜ distances, indices = self.index.search(query_vector, k) - # ๊ฒฐ๊ณผ ๋ณ€ํ™˜ results = [] - # ์—ญ๋ฐฉํ–ฅ ์ธ๋ฑ์Šค ๋งคํ•‘ (๋‚ด๋ถ€ ์ธ๋ฑ์Šค ID -> ์‚ฌ์šฉ์ž ID) index_to_id = {} for user_id, doc_info in self.document_store.items(): index_to_id[doc_info['index']] = user_id for i, (distance, idx) in enumerate(zip(distances[0], indices[0])): - if idx == -1: # -1์€ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Œ์„ ์˜๋ฏธ + if idx == -1: continue - # ๋‚ด๋ถ€ ์ธ๋ฑ์Šค๋ฅผ ํ†ตํ•ด ์‚ฌ์šฉ์ž ID ์ฐพ๊ธฐ if idx not in index_to_id: continue user_id = index_to_id[idx] doc_info = self.document_store[user_id] - # ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„๋กœ ๋ณ€ํ™˜ (์„ ํƒ์ ) score = 1.0 / (1.0 + distance) score = round(score, 3) - # ๊ฒฐ๊ณผ ๋”•์…”๋„ˆ๋ฆฌ ์ƒ์„ฑ result = { 'text': doc_info['text'], 'metadata': { @@ -209,7 +189,6 @@ def delete( Returns: ์„ฑ๊ณต ์—ฌ๋ถ€๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ๋ถˆ๋ฆฌ์–ธ ๊ฐ’ """ - # ํ•„ํ„ฐ ๊ธฐ๋ฐ˜ ์‚ญ์ œ if filters and not ids: ids_to_delete = [] for user_id, doc_info in self.document_store.items(): @@ -226,31 +205,24 @@ def delete( return self.delete(ids=ids_to_delete) return True - # ID๊ฐ€ ์—†๊ณ  ํ•„ํ„ฐ๋„ ์—†์œผ๋ฉด ๋ชจ๋“  ๋ฌธ์„œ ์‚ญ์ œ if ids is None and filters is None: - # ๋ชจ๋“  ๋ฌธ์„œ ์‚ญ์ œ - ์ธ๋ฑ์Šค๋ฅผ ์žฌ์„ค์ •ํ•ฉ๋‹ˆ๋‹ค base_index = faiss.IndexFlatL2(self.dim) self.index = faiss.IndexIDMap(base_index) self.document_store = {} self.next_id = 0 return True - # ํŠน์ • ID ์‚ญ์ œ if ids: - # ์‚ญ์ œํ•  ID๊ฐ€ ์กด์žฌํ•˜๋Š”์ง€ ํ™•์ธ ids_to_delete = [id for id in ids if id in self.document_store] if not ids_to_delete: - return True # ์‚ญ์ œํ•  ๊ฒƒ์ด ์—†์Œ + return True - # FAISS ๋‚ด๋ถ€ ID ๋ชฉ๋ก ์ถ”์ถœ faiss_ids = [self.document_store[user_id]['index'] for user_id in ids_to_delete] - # IndexIDMap์—์„œ ์ œ๊ณตํ•˜๋Š” remove_ids ๋ฉ”์„œ๋“œ ์‚ฌ์šฉ try: self.index.remove_ids(np.array(faiss_ids, dtype=np.int64)) - # ๋ฌธ์„œ ์ €์žฅ์†Œ์—์„œ๋„ ์‚ญ์ œ for user_id in ids_to_delete: del self.document_store[user_id] From eb34e20ea6bbfa08f971ec2ad343befe5d474088 Mon Sep 17 00:00:00 2001 From: SOHYEON Date: Tue, 6 May 2025 22:40:50 
+0900
Subject: [PATCH 3/3] Rename 11-Faiss.ipynb to 03-Faiss.ipynb

Modify the title of the Jupyter notebook.
---
 09-VectorStore/{11-Faiss.ipynb => 03-Faiss.ipynb} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename 09-VectorStore/{11-Faiss.ipynb => 03-Faiss.ipynb} (100%)

diff --git a/09-VectorStore/11-Faiss.ipynb b/09-VectorStore/03-Faiss.ipynb
similarity index 100%
rename from 09-VectorStore/11-Faiss.ipynb
rename to 09-VectorStore/03-Faiss.ipynb