From 1c827eba4866df9a4151797cf087bd19b0d70127 Mon Sep 17 00:00:00 2001
From: johnny9210
Date: Tue, 6 May 2025 15:21:37 +0900
Subject: [PATCH 1/3] [N-2] 11-Vector Store / 11 - Faiss

---
 09-VectorStore/11-Faiss.ipynb | 818 ++++++++++++++++++++++++++++++++++
 09-VectorStore/utils/faiss.py | 260 +++++++++++
 2 files changed, 1078 insertions(+)
 create mode 100644 09-VectorStore/11-Faiss.ipynb
 create mode 100644 09-VectorStore/utils/faiss.py

diff --git a/09-VectorStore/11-Faiss.ipynb b/09-VectorStore/11-Faiss.ipynb
new file mode 100644
index 000000000..422dfdb63
--- /dev/null
+++ b/09-VectorStore/11-Faiss.ipynb
@@ -0,0 +1,818 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "25733da0",
+   "metadata": {},
+   "source": [
+    "# Faiss\n",
+    "\n",
+    "- Author: [Ilgyun Jeong](https://github.com/johnny9210)\n",
+    "- Design: \n",
+    "- Peer Review: \n",
+    "- This is a part of [LangChain Open Tutorial](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial)\n",
+    "\n",
+    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/99-TEMPLATE/00-BASE-TEMPLATE-EXAMPLE.ipynb) [![Open in GitHub](https://img.shields.io/badge/Open%20in%20GitHub-181717?style=flat-square&logo=github&logoColor=white)](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/99-TEMPLATE/00-BASE-TEMPLATE-EXAMPLE.ipynb)\n",
+    "\n",
+    "## Overview\n",
+    "\n",
+    "This tutorial covers how to use ```Faiss``` with **LangChain** .\n",
+    "\n",
+    "```Faiss``` (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also includes supporting code for evaluation and parameter tuning.\n",
+    "\n",
+    "This tutorial walks you through performing **CRUD** operations with ```Faiss``` : **storing** , **updating** , and **deleting** documents, and performing **similarity-based retrieval** .\n",
+    "\n",
+    "### Table of Contents\n",
+    "\n",
+    "- [Overview](#overview)\n",
+    "- [Environment Setup](#environment-setup)\n",
+    "- [What is Faiss?](#what-is-faiss)\n",
+    "- [Data](#data)\n",
+    "- [Initial Setting Faiss](#initial-setting-faiss)\n",
+    "- [Document Manager](#document-manager)\n",
+    "\n",
+    "\n",
+    "### References\n",
+    "\n",
+    "- [Faiss](https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/)\n",
+    "- [Faiss library paper](https://arxiv.org/pdf/2401.08281)\n",
+    "- [Faiss documentation](https://faiss.ai/)\n",
+    "- [LangChain Faiss documentation](https://python.langchain.com/docs/integrations/vectorstores/faiss/)\n",
+    "----"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c1fac085",
+   "metadata": {},
+   "source": [
+    "## Environment Setup\n",
+    "\n",
+    "Set up the environment. You may refer to [Environment Setup](https://wikidocs.net/257836) for more details.\n",
+    "\n",
+    "**[Note]**\n",
+    "- ```langchain-opentutorial``` is a package that provides a set of easy-to-use environment setup, useful functions and utilities for tutorials. \n",
+    "- You can check out the [```langchain-opentutorial```](https://github.com/LangChain-OpenTutorial/langchain-opentutorial-pypi) repository for more details."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "98da7994",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture --no-stderr\n",
+    "%pip install langchain-opentutorial"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "69133474",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Install necessary packages\n",
+    "%pip install -qU langchain-community faiss-cpu\n",
+    "\n",
+    "# Note that you can also install faiss-gpu if you want to use the GPU-enabled version\n",
+    "# %pip install -qU langchain-community faiss-gpu"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "800c732b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Install required packages\n",
+    "from langchain_opentutorial import package\n",
+    "\n",
+    "package.install(\n",
+    "    [\n",
+    "        \"langsmith\",\n",
+    "        \"langchain\",\n",
+    "        \"langchain_core\",\n",
+    "        \"langchain_community\",\n",
+    "        \"faiss-cpu\",\n",
+    "    ],\n",
+    "    verbose=False,\n",
+    "    upgrade=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5b36bafa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Environment variables have been set successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Set environment variables\n",
+    "from langchain_opentutorial import set_env\n",
+    "\n",
+    "set_env(\n",
+    "    {\n",
+    "        \"OPENAI_API_KEY\": \"Your OPENAI API KEY\",\n",
+    "        \"LANGCHAIN_API_KEY\": \"Your LangChain API KEY\",\n",
+    "        \"LANGCHAIN_TRACING_V2\": \"true\",\n",
+    "        \"LANGCHAIN_ENDPOINT\": \"https://api.smith.langchain.com\",\n",
+    "        \"LANGCHAIN_PROJECT\": \"Faiss\",\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8011a0c7",
+   "metadata": {},
+   "source": [
+    "You can alternatively set API keys such as ```OPENAI_API_KEY``` in a ```.env``` file and load them.\n",
+    "\n",
+    "[Note] This is not necessary if you've already set the required API keys in previous steps.\n",
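+    "\n",
+    "As a minimal sketch (every value below is a placeholder, not a real key), such a ```.env``` file in the project root might look like this:\n",
+    "\n",
+    "```text\n",
+    "OPENAI_API_KEY=sk-...\n",
+    "LANGCHAIN_API_KEY=lsv2_...\n",
+    "LANGCHAIN_TRACING_V2=true\n",
+    "LANGCHAIN_ENDPOINT=https://api.smith.langchain.com\n",
+    "LANGCHAIN_PROJECT=Faiss\n",
+    "```"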
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "70d7e764",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "load_dotenv(override=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "915cf401",
+   "metadata": {},
+   "source": [
+    "## What is Faiss?\n",
+    "\n",
+    "```Faiss``` (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of dense vectors.\n",
+    "\n",
+    "* Core Concepts\n",
+    "  * ```Similarity search```: Finding vectors that are closest to a query vector\n",
+    "  * ```Scaling```: Handles vector sets of any size, including those exceeding RAM\n",
+    "  * ```Efficiency```: Optimized for memory usage and search speed\n",
+    "\n",
+    "* Vector Operations\n",
+    "  * ```Nearest neighbor```: Finding k vectors closest to a query vector\n",
+    "  * ```Maximum inner product```: Finding vectors with highest dot product\n",
+    "  * ```Clustering```: Grouping similar vectors together\n",
+    "\n",
+    "* Index Types\n",
+    "  * ```Flat```: Exact search with exhaustive comparison\n",
+    "  * ```IVF```: Inverted file structure for faster approximate search\n",
+    "  * ```HNSW```: Hierarchical navigable small world graphs for high-quality search\n",
+    "  * ```PQ```: Product quantization for memory compression\n",
+    "  * ```OPQ```: Optimized product quantization for better accuracy\n",
+    "\n",
+    "* Performance Metrics\n",
+    "  * ```Speed```: Query time for finding similar vectors\n",
+    "  * ```Memory```: RAM requirements for index storage\n",
+    "  * ```Accuracy```: How well results match exhaustive search (recall)\n",
+    "\n",
+    "* Technical Features\n",
+    "  * ```GPU support```: State-of-the-art GPU implementations with 5-20x speedup\n",
+    "  * ```Multi-threading```: Parallel processing across CPU cores\n",
+    "  * ```SIMD optimization```: Vectorized operations for faster computation\n",
+    "  * ```Half-precision```: Float16 support for better performance\n",
+    "\n",
+    "* Applications\n",
+    "  * ```Image similarity```: Finding visually similar images\n",
+    "  * ```Text embeddings```: Semantic search in document collections\n",
+    "  * ```Recommendation systems```: Finding similar items for users\n",
+    "  * ```Classification```: Computing maximum inner-products for classification"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f3b5bd2",
+   "metadata": {},
+   "source": [
+    "## Data\n",
+    "\n",
+    "This part walks you through the **data preparation process** .\n",
+    "\n",
+    "This section includes the following components:\n",
+    "\n",
+    "- Introduce Data\n",
+    "\n",
+    "- Preprocessing Data\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "508ae7f7",
+   "metadata": {},
+   "source": [
+    "### Introduce Data\n",
+    "\n",
+    "In this tutorial, we will use the fairy tale **📗 The Little Prince** as our data, in a plain-text (.txt) format converted from the original PDF.\n",
+    "\n",
+    "This material complies with the **Apache 2.0 license** .\n",
+    "\n",
+    "You can view the data at the link below.\n",
+    "- [Data Link](https://huggingface.co/datasets/sohyunwriter/the_little_prince)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "004ea4f4",
+   "metadata": {},
+   "source": [
+    "### Preprocessing Data\n",
+    "\n",
+    "In this tutorial section, we will preprocess the text data from The Little Prince and convert it into a list of ```LangChain Document``` objects with metadata. \n",
\n", + "\n", + "Each document chunk will include a ```title``` field in the metadata, extracted from the first line of each section." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8e4cac64", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema import Document\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "import re\n", + "from typing import List\n", + "\n", + "\n", + "def preprocessing_data(content: str) -> List[Document]:\n", + " # 1. Split the text by double newlines to separate sections\n", + " blocks = content.split(\"\\n\\n\")\n", + "\n", + " # 2. Initialize the text splitter\n", + " text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=500, # Maximum number of characters per chunk\n", + " chunk_overlap=50, # Overlap between chunks to preserve context\n", + " separators=[\"\\n\\n\", \"\\n\", \" \"], # Order of priority for splitting\n", + " )\n", + "\n", + " documents = []\n", + "\n", + " # 3. Loop through each section\n", + " for block in blocks:\n", + " lines = block.strip().splitlines()\n", + " if not lines:\n", + " continue\n", + "\n", + " # Extract title from the first line using square brackets [ ]\n", + " first_line = lines[0]\n", + " title_match = re.search(r\"\\[(.*?)\\]\", first_line)\n", + " title = title_match.group(1).strip() if title_match else None\n", + "\n", + " # Remove the title line from content\n", + " body = \"\\n\".join(lines[1:]).strip()\n", + " if not body:\n", + " continue\n", + "\n", + " # 4. Chunk the section using the text splitter\n", + " chunks = text_splitter.split_text(body)\n", + "\n", + " # 5. Create a LangChain Document for each chunk with the same title metadata\n", + " for chunk in chunks:\n", + " documents.append(Document(page_content=chunk, metadata={\"title\": title}))\n", + "\n", + " print(f\"Generated {len(documents)} chunked documents.\")\n", + "\n", + " return documents" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1d091a51", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated 262 chunked documents.\n" + ] + } + ], + "source": [ + "# Load the entire text file\n", + "with open(\"./data/the_little_prince.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " content = f.read()\n", + "\n", + "# Preprocessing Data\n", + "\n", + "docs = preprocessing_data(content=content)" + ] + }, + { + "cell_type": "markdown", + "id": "1977d4ff", + "metadata": {}, + "source": [ + "## Initial Setting Faiss\n", + "\n", + "This part walks you through the initial setup of ```Faiss``` .\n", + "\n", + "This section includes the following components:\n", + "\n", + "- Load Embedding Model\n", + "\n", + "- Load ```Faiss``` Client" + ] + }, + { + "cell_type": "markdown", + "id": "7eee56b2", + "metadata": {}, + "source": [ + "### Load Embedding Model\n", + "\n", + "In the **Load Embedding Model** section, you'll learn how to load an embedding model.\n", + "\n", + "This tutorial uses **OpenAI's** **API-Key** for loading the model.\n", + "\n", + "*๐Ÿ’ก If you prefer to use another embedding model, see the instructions below.*\n", + "- [Embedding Models](https://python.langchain.com/docs/integrations/text_embedding/)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5bd5c3c9", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embedding = OpenAIEmbeddings(model=\"text-embedding-3-large\")" + ] + }, + { + 
"cell_type": "markdown", + "id": "40f65795", + "metadata": {}, + "source": [ + "### Load Faiss Client\n", + "\n", + "In the **Load ```Faiss``` Client** section, we cover how to load the **database client object** using the **Python SDK** for ```Faiss``` .\n", + "- [Faiss Python SDK Docs](https://github.com/facebookresearch/faiss/wiki/getting-started?utm_source=chatgpt.com)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "eed0ebad", + "metadata": {}, + "outputs": [], + "source": [ + "# Create Database Client Object Function\n", + "import faiss\n", + "import numpy as np\n", + "\n", + "\n", + "def get_db_client(dim: int = 128):\n", + " \"\"\"\n", + "\n", + " Initializes and returns a VectorStore client instance.\n", + "\n", + "\n", + " This function loads configuration (e.g., API key, host) from environment\n", + "\n", + " variables or default values and creates a client object to interact\n", + "\n", + " with the faiss Python SDK.\n", + "\n", + "\n", + " Returns:\n", + "\n", + " client:ClientType - An instance of the faiss client.\n", + "\n", + "\n", + " Raises:\n", + "\n", + " ValueError: If required configuration is missing.\n", + "\n", + " \"\"\"\n", + "\n", + " base_index = faiss.IndexFlatL2(dim) # L2 ๊ฑฐ๋ฆฌ ๊ธฐ๋ฐ˜ ์ธ๋ฑ์Šค ์ƒ์„ฑ\n", + " client = faiss.IndexIDMap(base_index) # ID ๋งคํ•‘ ์ง€์› ์ถ”๊ฐ€\n", + "\n", + " return client" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2b5f4116", + "metadata": {}, + "outputs": [], + "source": [ + "# Get DB Client Object\n", + "client = get_db_client()" + ] + }, + { + "cell_type": "markdown", + "id": "3a5a97a0", + "metadata": {}, + "source": [ + "## Document Manager\n", + "\n", + "To support the **Langchain-Opentutorial** , we implemented a custom set of **CRUD** functionalities for VectorDBs. \n", + "\n", + "The following operations are included:\n", + "\n", + "- ```upsert``` : Update existing documents or insert if they donโ€™t exist\n", + "\n", + "- ```upsert_parallel``` : Perform upserts in parallel for large-scale data\n", + "\n", + "- ```similarity_search``` : Search for similar documents based on embeddings\n", + "\n", + "- ```delete``` : Remove documents based on filter conditions\n", + "\n", + "Each of these features is implemented as class methods specific to each VectorDB.\n", + "\n", + "In this tutorial, you can easily utilize these methods to interact with your VectorDB.\n", + "\n", + "*We plan to continuously expand the functionality by adding more common operations in the future.*" + ] + }, + { + "cell_type": "markdown", + "id": "65a40601", + "metadata": {}, + "source": [ + "### Create Instance\n", + "\n", + "First, we create an instance of the ```faiss``` helper class to use its CRUD functionalities.\n", + "\n", + "This class is initialized with the **```faiss``` Python SDK client instance**, **index name** and the **embedding model instance** , both of which were defined in the previous section." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dccab807", + "metadata": {}, + "outputs": [], + "source": [ + "# import FaissCRUDManager\n", + "from utils.faiss import FaissCRUDManager\n", + "\n", + "# connect to tutorial_index\n", + "crud_manager = FaissCRUDManager(dim=3072, embedding=embedding)" + ] + }, + { + "cell_type": "markdown", + "id": "5859c412", + "metadata": {}, + "source": [ + "Now you can use the following **CRUD** operations with the ```crud_manager``` instance.\n", + "\n", + "These instance allow you to easily manage documents in your ```faiss``` ." 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7c6c53c5",
+   "metadata": {},
+   "source": [
+    "### Upsert Document\n",
+    "\n",
+    "**Update** existing documents or **insert** if they don't exist\n",
+    "\n",
+    "**✅ Args**\n",
+    "\n",
+    "- ```texts``` : Iterable[str] – List of text contents to be inserted/updated.\n",
+    "\n",
+    "- ```metadatas``` : Optional[List[Dict]] – List of metadata dictionaries for each text (optional).\n",
+    "\n",
+    "- ```ids``` : Optional[List[str]] – Custom IDs for the documents. If not provided, IDs will be auto-generated.\n",
+    "\n",
+    "- ```**kwargs``` : Extra arguments for the underlying vector store.\n",
+    "\n",
+    "**🔄 Return**\n",
+    "\n",
+    "- None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "f3a6c32b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from uuid import uuid4\n",
+    "\n",
+    "# Create an ID for each document\n",
+    "ids = [str(uuid4()) for _ in docs]\n",
+    "\n",
+    "args = {\n",
+    "    \"texts\": [doc.page_content for doc in docs[:2]],\n",
+    "    \"metadatas\": [doc.metadata for doc in docs[:2]],\n",
+    "    \"ids\": ids[:2],\n",
+    "    # add extra parameters here if needed\n",
+    "}\n",
+    "\n",
+    "crud_manager.upsert(**args)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "278fe1ed",
+   "metadata": {},
+   "source": [
+    "### Upsert Parallel Document\n",
+    "\n",
+    "Perform **upserts** in **parallel** for large-scale data\n",
+    "\n",
+    "**✅ Args**\n",
+    "\n",
+    "- ```texts``` : Iterable[str] – List of text contents to be inserted/updated.\n",
+    "\n",
+    "- ```metadatas``` : Optional[List[Dict]] – List of metadata dictionaries for each text (optional).\n",
+    "\n",
+    "- ```ids``` : Optional[List[str]] – Custom IDs for the documents. If not provided, IDs will be auto-generated.\n",
+    "\n",
+    "- ```batch_size``` : int – Number of documents per batch (default: 32).\n",
+    "\n",
+    "- ```workers``` : int – Number of parallel workers (default: 10).\n",
+    "\n",
+    "- ```**kwargs``` : Extra arguments for the underlying vector store.\n",
+    "\n",
+    "**🔄 Return**\n",
+    "\n",
+    "- None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "a89dd8e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from uuid import uuid4\n",
+    "\n",
+    "args = {\n",
+    "    \"texts\": [doc.page_content for doc in docs],\n",
+    "    \"metadatas\": [doc.metadata for doc in docs],\n",
+    "    \"ids\": ids,\n",
+    "    # add extra parameters here if needed\n",
+    "}\n",
+    "\n",
+    "crud_manager.upsert_parallel(**args)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6beea197",
+   "metadata": {},
+   "source": [
+    "### Similarity Search\n",
+    "\n",
+    "Search for **similar documents** based on **embeddings** .\n",
+    "\n",
+    "Internally, this implementation measures **L2 (Euclidean) distance** between embeddings and converts each distance to a similarity score via ```1 / (1 + distance)``` .\n",
+    "\n",
+    "\n",
+    "**✅ Args**\n",
+    "\n",
+    "- ```query``` : str – The text query for similarity search.\n",
+    "\n",
+    "- ```k``` : int – Number of top results to return (default: 10).\n",
+    "\n",
+    "- ```**kwargs``` : Additional search options (e.g., filters).\n",
+    "\n",
+    "**🔄 Return**\n",
+    "\n",
+    "- ```results``` : List[Dict] – A list of result dictionaries (with ```text``` , ```metadata``` , and ```score``` keys) ranked by similarity."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "5859782b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Rank 1\n",
+      "Contents : And he went back to meet the fox. \n",
+      "\"Goodbye,\" he said. \n",
\"And now here is my secret, a very simple secret: It is only with the heart that one can see rightly; what is essential is invisible to the eye.\" \n", + "\"What is essential is invisible to the eye,\" the little prince repeated, so that he would be sure to remember.\n", + "\"It is the time you have wasted for your rose that makes your rose so important.\"\n", + "Metadata: {'id': 'f3377018-0d2c-45b0-baec-75eb071ef5f5', 'title': 'Chapter 21'}\n", + "Similarity Score: 0.504\n", + "\n", + "Rank 2\n", + "Contents : \"Yes,\" I said to the little prince. \"The house, the stars, the desert-- what gives them their beauty is something that is invisible!\" \n", + "\"I am glad,\" he said, \"that you agree with my fox.\"\n", + "Metadata: {'id': '88bc73d6-a7f5-424c-9a6d-6aa496b5254a', 'title': 'Chapter 24'}\n", + "Similarity Score: 0.498\n", + "\n", + "Rank 3\n", + "Contents : \"The men where you live,\" said the little prince, \"raise five thousand roses in the same garden-- and they do not find in it what they are looking for.\" \n", + "\"They do not find it,\" I replied. \n", + "\"And yet what they are looking for could be found in one single rose, or in a little water.\" \n", + "\"Yes, that is true,\" I said. \n", + "And the little prince added: \n", + "\"But the eyes are blind. One must look with the heart...\"\n", + "Metadata: {'id': 'e616d38d-b757-487d-9a5b-b164a69efb15', 'title': 'Chapter 25'}\n", + "Similarity Score: 0.464\n", + "\n" + ] + } + ], + "source": [ + "# Search by Query\n", + "\n", + "results = crud_manager.search(query=\"What is essential is invisible to the eye.\", k=3)\n", + "for idx, result in enumerate(results):\n", + " print(f\"Rank {idx+1}\")\n", + " print(f\"Contents : {result['text']}\")\n", + " print(f\"Metadata: {result['metadata']}\")\n", + " print(f\"Similarity Score: {result['score']}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "id": "9ad0ed0c", + "metadata": {}, + "source": [ + "### Delete Document\n", + "\n", + "Remove documents based on filter conditions\n", + "\n", + "**โœ… Args**\n", + "\n", + "- ```ids``` : Optional[List[str]] โ€“ List of document IDs to delete. 
If None, deletion is based on filter.\n",
+    "\n",
+    "- ```filters``` : Optional[Dict] – Dictionary specifying filter conditions (e.g., metadata match).\n",
+    "\n",
+    "- ```**kwargs``` : Any additional parameters.\n",
+    "\n",
+    "**🔄 Return**\n",
+    "\n",
+    "- Boolean"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "0e3a2c33",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Delete by ids\n",
+    "\n",
+    "ids = ids[:5]  # The 'ids' you want to delete\n",
+    "crud_manager.delete(ids=ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "60bcb4cf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Delete by filters\n",
+    "\n",
+    "filters = {\"title\": \"Chapter 6\"}\n",
+    "crud_manager.delete(filters=filters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "30d42d2e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Delete All\n",
+    "\n",
+    "crud_manager.delete()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4663706b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/09-VectorStore/utils/faiss.py b/09-VectorStore/utils/faiss.py
new file mode 100644
index 000000000..5efd50fbe
--- /dev/null
+++ b/09-VectorStore/utils/faiss.py
@@ -0,0 +1,260 @@
+from utils.vectordbinterface import DocumentManager
+from utils.vectordbinterface import Iterable, Any, Optional, List, Dict
+from langchain_core.documents import Document
+import faiss
+import numpy as np
+from uuid import uuid4
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional, Any, Iterable
+
+
+class FaissCRUDManager(DocumentManager):
+    def __init__(
+        self, dim: int = 768, embedding: Optional[Any] = None, **kwargs
+    ) -> None:
+        """
+        Initialize the FAISS vector database manager.
+
+        Args:
+            dim: Dimension of the embedding vectors
+            embedding: Optional embedding function
+            **kwargs: Additional arguments
+        """
+        super().__init__()
+        self.dim = dim
+        self.embedding = embedding
+
+        # Initialize the FAISS index - wrap with IndexIDMap for ID support
+        base_index = faiss.IndexFlatL2(dim)  # Create an index based on L2 distance
+        self.index = faiss.IndexIDMap(base_index)  # Add ID mapping support
+
+        # Dictionary for storing documents and metadata by ID
+        self.document_store = {}
+        self.next_id = 0  # Internal ID counter
+
+    def upsert(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[Dict]] = None,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Embed texts and add them to the FAISS index.
+
+        Args:
+            texts: Documents or text strings
+            metadatas: Metadata dictionaries
+            ids: Unique IDs; auto-generated if None
+            **kwargs: Additional parameters
๋งค๊ฐœ๋ณ€์ˆ˜ + """ + texts_list = list(texts) + + if ids is None: + ids = [str(uuid4()) for _ in range(len(texts_list))] + + if metadatas is None: + metadatas = [{} for _ in range(len(texts_list))] + + # ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ + if self.embedding: + embeddings = self.embedding.embed_documents(texts_list) + else: + # ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ (์˜ˆ์‹œ๋กœ ๋žœ๋ค ๋ฒกํ„ฐ ์ƒ์„ฑ) + embeddings = [np.random.rand(self.dim).astype('float32') for _ in texts_list] + + # FAISS ์ธ๋ฑ์Šค์— ๋ฒกํ„ฐ์™€ ID ์ถ”๊ฐ€ + vectors = np.array(embeddings).astype('float32') + + # ID๋ฅผ ์ •์ˆ˜๋กœ ๋ณ€ํ™˜ (FAISS๋Š” ์ •์ˆ˜ ID๋งŒ ์ง€์›) + int_ids = np.array([i + self.next_id for i in range(len(texts_list))], dtype=np.int64) + + # ๋ฒกํ„ฐ์™€ ID ํ•จ๊ป˜ ์ถ”๊ฐ€ + self.index.add_with_ids(vectors, int_ids) + + # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์™€ ๋ฌธ์„œ ์ €์žฅ + for i, (text, metadata, user_id) in enumerate(zip(texts_list, metadatas, ids)): + # ํ˜„์žฌ ์ธ๋ฑ์Šค๋Š” self.next_id + i + idx = self.next_id + i + + self.document_store[user_id] = { + 'index': idx, # FAISS ์ธ๋ฑ์Šค ์ €์žฅ + 'text': text, + 'metadata': metadata + } + + self.next_id += len(texts_list) + + def upsert_parallel( + self, + texts: Iterable[str], + metadatas: Optional[List[Dict]] = None, + ids: Optional[List[str]] = None, + batch_size: int = 32, + workers: int = 10, + **kwargs: Any, + ) -> None: + """ + ํ…์ŠคํŠธ๋ฅผ ๋ณ‘๋ ฌ๋กœ ์ฒ˜๋ฆฌํ•˜์—ฌ ์ž„๋ฒ ๋”ฉํ•˜๊ณ  FAISS ์ธ๋ฑ์Šค์— ์ถ”๊ฐ€ + + Args: + texts: ๋ฌธ์„œ ๋˜๋Š” ํ…์ŠคํŠธ + metadatas: ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ + ids: ๊ณ ์œ  ID, None์ด๋ฉด ์ž๋™ ์ƒ์„ฑ + batch_size: ๋ฐฐ์น˜ ํฌ๊ธฐ + workers: ์ž‘์—…์ž ์ˆ˜ + **kwargs: ์ถ”๊ฐ€ ๋งค๊ฐœ๋ณ€์ˆ˜ + """ + # ๋ฐฐ์น˜ ์ƒ์„ฑ + texts_list = list(texts) + total = len(texts_list) + + if ids is None: + ids = [str(uuid4()) for _ in range(total)] + + if metadatas is None: + metadatas = [{} for _ in range(total)] + + batches = [ + ( + texts_list[i : i + batch_size], + metadatas[i : i + batch_size] if metadatas else None, + ids[i : i + batch_size] if ids else None, + ) + for i in range(0, total, batch_size) + ] + + # ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ + with ThreadPoolExecutor(max_workers=workers) as executor: + list(executor.map(lambda batch: self.upsert(*batch, **kwargs), batches)) + + def search(self, query: str, k: int = 10, **kwargs: Any) -> List[Dict]: + """ + ์ฟผ๋ฆฌ์™€ ๊ฐ€์žฅ ์œ ์‚ฌํ•œ ๋ฌธ์„œ ๊ฒ€์ƒ‰ + + Args: + query: ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ + k: ๋ฐ˜ํ™˜ํ•  ๊ฒฐ๊ณผ ์ˆ˜ + **kwargs: ํ•„ํ„ฐ๋ง ์˜ต์…˜ + + Returns: + ์œ ์‚ฌํ•œ ๋ฌธ์„œ ๋ฆฌ์ŠคํŠธ + """ + if not self.document_store: # ๋ฌธ์„œ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ + return [] + + # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ + if self.embedding: + query_embedding = self.embedding.embed_documents([query])[0] + else: + # ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ (์˜ˆ์‹œ๋กœ ๋žœ๋ค ๋ฒกํ„ฐ ์ƒ์„ฑ) + query_embedding = np.random.rand(self.dim).astype('float32') + + # ๋ฒกํ„ฐ๋ฅผ numpy ๋ฐฐ์—ด๋กœ ๋ณ€ํ™˜ + query_vector = np.array([query_embedding]).astype('float32') + + # ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ - IndexIDMap์˜ ๊ฒฝ์šฐ ๊ฑฐ๋ฆฌ์™€ ํ•จ๊ป˜ ์‹ค์ œ ID ๋ฐ˜ํ™˜ + distances, indices = self.index.search(query_vector, k) + + # ๊ฒฐ๊ณผ ๋ณ€ํ™˜ + results = [] + + # ์—ญ๋ฐฉํ–ฅ ์ธ๋ฑ์Šค ๋งคํ•‘ (๋‚ด๋ถ€ ์ธ๋ฑ์Šค ID -> ์‚ฌ์šฉ์ž ID) + index_to_id = {} + for user_id, doc_info in self.document_store.items(): + index_to_id[doc_info['index']] = user_id + + for i, (distance, idx) in enumerate(zip(distances[0], indices[0])): + if idx == -1: # -1์€ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Œ์„ ์˜๋ฏธ + continue + + # ๋‚ด๋ถ€ ์ธ๋ฑ์Šค๋ฅผ ํ†ตํ•ด ์‚ฌ์šฉ์ž ID ์ฐพ๊ธฐ + if idx not in index_to_id: + continue + + user_id = index_to_id[idx] + doc_info = self.document_store[user_id] + + # 
์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„๋กœ ๋ณ€ํ™˜ (์„ ํƒ์ ) + score = 1.0 / (1.0 + distance) + score = round(score, 3) + + # ๊ฒฐ๊ณผ ๋”•์…”๋„ˆ๋ฆฌ ์ƒ์„ฑ + result = { + 'text': doc_info['text'], + 'metadata': { + 'id': user_id, + **doc_info['metadata'] + }, + 'score': score + } + results.append(result) + + return results + + def delete( + self, + ids: Optional[List[str]] = None, + filters: Optional[Dict] = None, + **kwargs: Any, + ) -> bool: + """ + ์ธ๋ฑ์Šค์—์„œ ๋ฌธ์„œ ์‚ญ์ œ + + Args: + ids: ์‚ญ์ œํ•  ๋ฌธ์„œ์˜ ID ๋ชฉ๋ก + filters: ์‚ญ์ œํ•  ๋ฌธ์„œ๋ฅผ ํ•„ํ„ฐ๋งํ•˜๋Š” ์กฐ๊ฑด + **kwargs: ์ถ”๊ฐ€ ๋งค๊ฐœ๋ณ€์ˆ˜ + + Returns: + ์„ฑ๊ณต ์—ฌ๋ถ€๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ๋ถˆ๋ฆฌ์–ธ ๊ฐ’ + """ + # ํ•„ํ„ฐ ๊ธฐ๋ฐ˜ ์‚ญ์ œ + if filters and not ids: + ids_to_delete = [] + for user_id, doc_info in self.document_store.items(): + match = True + for key, value in filters.items(): + if key not in doc_info['metadata'] or doc_info['metadata'][key] != value: + match = False + break + + if match: + ids_to_delete.append(user_id) + + if ids_to_delete: + return self.delete(ids=ids_to_delete) + return True + + # ID๊ฐ€ ์—†๊ณ  ํ•„ํ„ฐ๋„ ์—†์œผ๋ฉด ๋ชจ๋“  ๋ฌธ์„œ ์‚ญ์ œ + if ids is None and filters is None: + # ๋ชจ๋“  ๋ฌธ์„œ ์‚ญ์ œ - ์ธ๋ฑ์Šค๋ฅผ ์žฌ์„ค์ •ํ•ฉ๋‹ˆ๋‹ค + base_index = faiss.IndexFlatL2(self.dim) + self.index = faiss.IndexIDMap(base_index) + self.document_store = {} + self.next_id = 0 + return True + + # ํŠน์ • ID ์‚ญ์ œ + if ids: + # ์‚ญ์ œํ•  ID๊ฐ€ ์กด์žฌํ•˜๋Š”์ง€ ํ™•์ธ + ids_to_delete = [id for id in ids if id in self.document_store] + + if not ids_to_delete: + return True # ์‚ญ์ œํ•  ๊ฒƒ์ด ์—†์Œ + + # FAISS ๋‚ด๋ถ€ ID ๋ชฉ๋ก ์ถ”์ถœ + faiss_ids = [self.document_store[user_id]['index'] for user_id in ids_to_delete] + + # IndexIDMap์—์„œ ์ œ๊ณตํ•˜๋Š” remove_ids ๋ฉ”์„œ๋“œ ์‚ฌ์šฉ + try: + self.index.remove_ids(np.array(faiss_ids, dtype=np.int64)) + + # ๋ฌธ์„œ ์ €์žฅ์†Œ์—์„œ๋„ ์‚ญ์ œ + for user_id in ids_to_delete: + del self.document_store[user_id] + + return True + except Exception as e: + print(f"FAISS ์‚ญ์ œ ์˜ค๋ฅ˜: {e}") + return False \ No newline at end of file From 032f00e7c8232ce6e0b0fbca8e0e0fcc02fe3303 Mon Sep 17 00:00:00 2001 From: johnny9210 Date: Tue, 6 May 2025 15:27:39 +0900 Subject: [PATCH 2/3] [N-2] 11-Vector Store / 11 - Faiss --- 09-VectorStore/utils/faiss.py | 42 ++++++----------------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/09-VectorStore/utils/faiss.py b/09-VectorStore/utils/faiss.py index 5efd50fbe..c67348a62 100644 --- a/09-VectorStore/utils/faiss.py +++ b/09-VectorStore/utils/faiss.py @@ -24,13 +24,11 @@ def __init__( self.dim = dim self.embedding = embedding - # FAISS ์ธ๋ฑ์Šค ์ดˆ๊ธฐํ™” - IndexIDMap์œผ๋กœ ๊ฐ์‹ธ์„œ ID ์ง€์› ์ถ”๊ฐ€ - base_index = faiss.IndexFlatL2(dim) # L2 ๊ฑฐ๋ฆฌ ๊ธฐ๋ฐ˜ ์ธ๋ฑ์Šค ์ƒ์„ฑ - self.index = faiss.IndexIDMap(base_index) # ID ๋งคํ•‘ ์ง€์› ์ถ”๊ฐ€ + base_index = faiss.IndexFlatL2(dim) + self.index = faiss.IndexIDMap(base_index) - # ID์™€ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ๋ฅผ ์ €์žฅํ•  ๋”•์…”๋„ˆ๋ฆฌ self.document_store = {} - self.next_id = 0 # ๋‚ด๋ถ€ ID ์นด์šดํ„ฐ + self.next_id = 0 def upsert( self, @@ -56,29 +54,22 @@ def upsert( if metadatas is None: metadatas = [{} for _ in range(len(texts_list))] - # ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ if self.embedding: embeddings = self.embedding.embed_documents(texts_list) else: - # ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ (์˜ˆ์‹œ๋กœ ๋žœ๋ค ๋ฒกํ„ฐ ์ƒ์„ฑ) embeddings = [np.random.rand(self.dim).astype('float32') for _ in texts_list] - # FAISS ์ธ๋ฑ์Šค์— ๋ฒกํ„ฐ์™€ ID ์ถ”๊ฐ€ vectors = np.array(embeddings).astype('float32') - # ID๋ฅผ ์ •์ˆ˜๋กœ 
๋ณ€ํ™˜ (FAISS๋Š” ์ •์ˆ˜ ID๋งŒ ์ง€์›) int_ids = np.array([i + self.next_id for i in range(len(texts_list))], dtype=np.int64) - # ๋ฒกํ„ฐ์™€ ID ํ•จ๊ป˜ ์ถ”๊ฐ€ self.index.add_with_ids(vectors, int_ids) - # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์™€ ๋ฌธ์„œ ์ €์žฅ for i, (text, metadata, user_id) in enumerate(zip(texts_list, metadatas, ids)): - # ํ˜„์žฌ ์ธ๋ฑ์Šค๋Š” self.next_id + i idx = self.next_id + i self.document_store[user_id] = { - 'index': idx, # FAISS ์ธ๋ฑ์Šค ์ €์žฅ + 'index': idx, 'text': text, 'metadata': metadata } @@ -105,7 +96,6 @@ def upsert_parallel( workers: ์ž‘์—…์ž ์ˆ˜ **kwargs: ์ถ”๊ฐ€ ๋งค๊ฐœ๋ณ€์ˆ˜ """ - # ๋ฐฐ์น˜ ์ƒ์„ฑ texts_list = list(texts) total = len(texts_list) @@ -124,7 +114,6 @@ def upsert_parallel( for i in range(0, total, batch_size) ] - # ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ with ThreadPoolExecutor(max_workers=workers) as executor: list(executor.map(lambda batch: self.upsert(*batch, **kwargs), batches)) @@ -140,46 +129,37 @@ def search(self, query: str, k: int = 10, **kwargs: Any) -> List[Dict]: Returns: ์œ ์‚ฌํ•œ ๋ฌธ์„œ ๋ฆฌ์ŠคํŠธ """ - if not self.document_store: # ๋ฌธ์„œ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ + if not self.document_store: return [] - # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ if self.embedding: query_embedding = self.embedding.embed_documents([query])[0] else: - # ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ (์˜ˆ์‹œ๋กœ ๋žœ๋ค ๋ฒกํ„ฐ ์ƒ์„ฑ) query_embedding = np.random.rand(self.dim).astype('float32') - # ๋ฒกํ„ฐ๋ฅผ numpy ๋ฐฐ์—ด๋กœ ๋ณ€ํ™˜ query_vector = np.array([query_embedding]).astype('float32') - # ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ - IndexIDMap์˜ ๊ฒฝ์šฐ ๊ฑฐ๋ฆฌ์™€ ํ•จ๊ป˜ ์‹ค์ œ ID ๋ฐ˜ํ™˜ distances, indices = self.index.search(query_vector, k) - # ๊ฒฐ๊ณผ ๋ณ€ํ™˜ results = [] - # ์—ญ๋ฐฉํ–ฅ ์ธ๋ฑ์Šค ๋งคํ•‘ (๋‚ด๋ถ€ ์ธ๋ฑ์Šค ID -> ์‚ฌ์šฉ์ž ID) index_to_id = {} for user_id, doc_info in self.document_store.items(): index_to_id[doc_info['index']] = user_id for i, (distance, idx) in enumerate(zip(distances[0], indices[0])): - if idx == -1: # -1์€ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Œ์„ ์˜๋ฏธ + if idx == -1: continue - # ๋‚ด๋ถ€ ์ธ๋ฑ์Šค๋ฅผ ํ†ตํ•ด ์‚ฌ์šฉ์ž ID ์ฐพ๊ธฐ if idx not in index_to_id: continue user_id = index_to_id[idx] doc_info = self.document_store[user_id] - # ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„๋กœ ๋ณ€ํ™˜ (์„ ํƒ์ ) score = 1.0 / (1.0 + distance) score = round(score, 3) - # ๊ฒฐ๊ณผ ๋”•์…”๋„ˆ๋ฆฌ ์ƒ์„ฑ result = { 'text': doc_info['text'], 'metadata': { @@ -209,7 +189,6 @@ def delete( Returns: ์„ฑ๊ณต ์—ฌ๋ถ€๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ๋ถˆ๋ฆฌ์–ธ ๊ฐ’ """ - # ํ•„ํ„ฐ ๊ธฐ๋ฐ˜ ์‚ญ์ œ if filters and not ids: ids_to_delete = [] for user_id, doc_info in self.document_store.items(): @@ -226,31 +205,24 @@ def delete( return self.delete(ids=ids_to_delete) return True - # ID๊ฐ€ ์—†๊ณ  ํ•„ํ„ฐ๋„ ์—†์œผ๋ฉด ๋ชจ๋“  ๋ฌธ์„œ ์‚ญ์ œ if ids is None and filters is None: - # ๋ชจ๋“  ๋ฌธ์„œ ์‚ญ์ œ - ์ธ๋ฑ์Šค๋ฅผ ์žฌ์„ค์ •ํ•ฉ๋‹ˆ๋‹ค base_index = faiss.IndexFlatL2(self.dim) self.index = faiss.IndexIDMap(base_index) self.document_store = {} self.next_id = 0 return True - # ํŠน์ • ID ์‚ญ์ œ if ids: - # ์‚ญ์ œํ•  ID๊ฐ€ ์กด์žฌํ•˜๋Š”์ง€ ํ™•์ธ ids_to_delete = [id for id in ids if id in self.document_store] if not ids_to_delete: - return True # ์‚ญ์ œํ•  ๊ฒƒ์ด ์—†์Œ + return True - # FAISS ๋‚ด๋ถ€ ID ๋ชฉ๋ก ์ถ”์ถœ faiss_ids = [self.document_store[user_id]['index'] for user_id in ids_to_delete] - # IndexIDMap์—์„œ ์ œ๊ณตํ•˜๋Š” remove_ids ๋ฉ”์„œ๋“œ ์‚ฌ์šฉ try: self.index.remove_ids(np.array(faiss_ids, dtype=np.int64)) - # ๋ฌธ์„œ ์ €์žฅ์†Œ์—์„œ๋„ ์‚ญ์ œ for user_id in ids_to_delete: del self.document_store[user_id] From eb34e20ea6bbfa08f971ec2ad343befe5d474088 Mon Sep 17 00:00:00 2001 From: SOHYEON Date: Tue, 6 May 2025 22:40:50 
+0900
Subject: [PATCH 3/3] Rename 11-Faiss.ipynb to 03-Faiss.ipynb

Modify the title of the Jupyter notebook.
---
 09-VectorStore/{11-Faiss.ipynb => 03-Faiss.ipynb} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename 09-VectorStore/{11-Faiss.ipynb => 03-Faiss.ipynb} (100%)

diff --git a/09-VectorStore/11-Faiss.ipynb b/09-VectorStore/03-Faiss.ipynb
similarity index 100%
rename from 09-VectorStore/11-Faiss.ipynb
rename to 09-VectorStore/03-Faiss.ipynb