From dcb15731e2f7a4f416a83682f69d66353ebc8319 Mon Sep 17 00:00:00 2001 From: Jongho Lee Date: Wed, 15 Jan 2025 19:16:14 +0900 Subject: [PATCH 1/3] [N-2] 09-VectorStore / 09-Neo4j Tutorial about basic usage of Neo4j as a Vector DB, NOT GraphDB. neo4j_interface.py contains utility functions for Neo4j built upon neo4j python sdk. Not completed... --- 09-VectorStore/09-Neo4j.ipynb | 1048 +++++++++++++++++++++++ 09-VectorStore/utils/neo4j_interface.py | 841 ++++++++++++++++++ 2 files changed, 1889 insertions(+) create mode 100644 09-VectorStore/09-Neo4j.ipynb create mode 100644 09-VectorStore/utils/neo4j_interface.py diff --git a/09-VectorStore/09-Neo4j.ipynb b/09-VectorStore/09-Neo4j.ipynb new file mode 100644 index 000000000..284c54f15 --- /dev/null +++ b/09-VectorStore/09-Neo4j.ipynb @@ -0,0 +1,1048 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "z47qcQPDHZq_" + }, + "source": [ + "# Neo4j Vector Index\n", + "\n", + "- Author: [Jongho](https://github.com/XaviereKU)\n", + "- Design: \n", + "- Peer Review: \n", + "- This is a part of [LangChain Open Tutorial](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial)\n", + "\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/99-TEMPLATE/00-BASE-TEMPLATE-EXAMPLE.ipynb) [![Open in GitHub](https://img.shields.io/badge/Open%20in%20GitHub-181717?style=flat-square&logo=github&logoColor=white)](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/99-TEMPLATE/00-BASE-TEMPLATE-EXAMPLE.ipynb)\n", + "\n", + "## Overview\n", + "Neo4j is a Graph database backed by vector store and can be deployed locally or on cloud.\n", + "\n", + "In this tutorial we utilize its ability to store vectors only, and deal with its real ability, Graph database, later.\n", + "\n", + "To encode data into vector, we use ```OpenAI Embedding```, but you can use any embedding 
you want.\n", + "\n", + "Furthermore, you need to note that you should read about ```Cypher```, declarative query language for Neo4j, to fully utilize Neo4j.\n", + "\n", + "We use some Cypher queries but will not go deeply. You can visit Cypher official document web site in References.\n", + "\n", + "For more information, visit [Neo4j](https://neo4j.com/).\n", + "\n", + "### Table of Contents\n", + "\n", + "- [Overview](#overview)\n", + "- [Environment Setup](#environment-setup)\n", + "- [Setup Neo4j](#setup-neo4j)\n", + "\t- [Getting started with Aura](#getting-started-with-aura)\n", + "\t- [Getting started with Docker](#getting-started-with-docker)\n", + "- [Credentials](#credentials)\n", + "- [Initialization](#initialization)\n", + "\t- [List Indexes](#list-indexs)\n", + "\t- [Create Index](#create-index)\n", + "\t- [Delete Index](#delete-index)\n", + "\t- [Select Embedding model](#select-embeddings-model)\n", + "\t- [Data Preprocessing](#data-preprocessing)\n", + "- [Manage vector store](#manage-vector-store)\n", + "\t- [Add items to vector store](#add-items-to-vector-store)\n", + "\t- [Delete items from vector store](#delete-items-from-vector-store)\n", + "\t- [Scroll items from vector store](#scroll-items-from-vector-store)\n", + "\t- [(Advanced)Scroll items with query](#advanced-scroll-items-with-query)\n", + "\n", + "### References\n", + "\n", + "- [Cypher](https://neo4j.com/docs/cypher-manual/current/introduction/)\n", + "- [Neo4j Docker Installation](https://hub.docker.com/_/neo4j)\n", + "- [Neo4j Official Installation guide](https://neo4j.com/docs/operations-manual/current/installation/)\n", + "- [Neo4j Python SDK document](https://neo4j.com/docs/api/python-driver/current/index.html)\n", + "- [Neo4j document](https://neo4j.com/docs/)\n", + "- [Langchain Neo4j document](https://python.langchain.com/docs/integrations/vectorstores/neo4jvector/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wEk1SoFEfwjo" + }, + "source": [ + "## 
Environment Setup\n", + "\n", + "Set up the environment. You may refer to [Environment Setup](https://wikidocs.net/257836) for more details.\n", + "\n", + "**[Note]**\n", + "- ```langchain-opentutorial``` is a package that provides a set of easy-to-use environment setup, useful functions and utilities for tutorials. \n", + "- You can checkout the [```langchain-opentutorial```](https://github.com/LangChain-OpenTutorial/langchain-opentutorial-pypi) for more details.\n", + "- We built ```Neo4jDB``` class from Python SDK of ```Neo4j```. Langchain also supports neo4j vector store class but it lacks some methods like delete. Look neo4j_interface.py in utils" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "n9NVKk-Zf9Nq" + }, + "outputs": [], + "source": [ + "%%capture --no-stderr\n", + "%pip install langchain-opentutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "# Pip install necessary package\n", + "%pip install -qU neo4j" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "IMx2hZNXf9QL" + }, + "outputs": [], + "source": [ + "# Install required packages\n", + "from langchain_opentutorial import package\n", + "\n", + "package.install(\n", + " [\n", + " \"langsmith\",\n", + " \"langchain\",\n", + " \"langchain_core\",\n", + " \"langchain_community\",\n", + " \"langchain_openai\",\n", + " \"neo4j\",\n", + " \"nltk\",\n", + " \"graphdatascience\",\n", + " ],\n", + " verbose=False,\n", + " upgrade=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "N8C6pLTZf9Sb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Environment variables have been set successfully.\n" + ] + } + ], + "source": [ + "# Set 
environment variables\n", + "from langchain_opentutorial import set_env\n", + "\n", + "set_env(\n", + " {\n", + " \"OPENAI_API_KEY\": \"Your OepnAI API Key\",\n", + " \"LANGCHAIN_API_KEY\": \"Your LangChain API Key\",\n", + " \"LANGCHAIN_TRACING_V2\": \"true\",\n", + " \"LANGCHAIN_ENDPOINT\": \"https://api.smith.langchain.com\",\n", + " \"LANGCHAIN_PROJECT\": \"Neo4j\",\n", + " \"NEO4J_URI\": \"Your NEO4J Aura URI\",\n", + " \"NEO4J_USERNAME\": \"Your NEO4J Aura username\",\n", + " \"NEO4J_PASSWORD\": \"Your NEO4J Aura password\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can alternatively set API keys such as ```OPENAI_API_KEY``` in a ```.env``` file and load them.\n", + "\n", + "[Note] This is not necessary if you've already set the required API keys in previous steps." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load API keys from .env file\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv(override=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup Neo4j\n", + "We have two options to start with. 
Cloud or local deployment.\n", + "\n", + "In this tutorial, we will use the cloud service called ```Aura```, provided by ```Neo4j```.\n", + "\n", + "But we will also describe how to deploy ```Neo4j``` with docker.\n", + "\n", + "### Getting started with Aura\n", + "You can create a new ```Neo4j Aura``` account at the [Neo4j](https://neo4j.com/) official website.\n", + "\n", + "Visit the website and click Get Started Free at the top right.\n", + "\n", + "### Getting started with Docker\n", + "We now describe how to run ```Neo4j``` using docker.\n", + "\n", + "To run Neo4j container, we use the following command.\n", + "```\n", + "docker run \\\n", + " -itd \\\n", + " --publish=7474:7474 --publish=7687:7687 \\\n", + " --volume=$HOME/neo4j/data:/data \\\n", + " --env=NEO4J_AUTH=none \\\n", + " --name neo4j \\\n", + " neo4j\n", + "```\n", + "\n", + "You can visit **Neo4j Docker installation** reference to check more detailed information.\n", + "\n", + "**[NOTE]**\n", + "* ```Neo4j``` also supports macOS, windows and Linux native deployment. Visit **Neo4j Official Installation guide** reference for more detail.\n", + "* ```Neo4j``` community edition only supports one database." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "USvgdgjznDsd" + }, + "source": [ + "## Credentials\n", + "Now, if you successfully create your own account for Aura, you will get your ```NEO4J_URI```, ```NEO4J_USERNAME```, ```NEO4J_PASSWORD```.\n", + "\n", + "Add them to the environment variables above or to your ```.env``` file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from importlib import reload\n", + "import utils.neo4j_interface as ni\n", + "\n", + "reload(ni)\n", + "Neo4jDB = ni.Neo4jDB" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "QzFkuokSnL1e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected to Neo4j database\n", + "Connection info\n", + "URI=neo4j+s://3ed1167e.databases.neo4j.io\n", + "username=neo4j\n", + "Neo4j version is above 5.23\n" + ] + } + ], + "source": [ + "import os\n", + "import time\n", + "from utils.neo4j_interface import Neo4jDB\n", + "\n", + "# set uri, username, password\n", + "uri = os.getenv(\"NEO4J_URI\")\n", + "username = os.getenv(\"NEO4J_USERNAME\")\n", + "password = os.getenv(\"NEO4J_PASSWORD\")\n", + "\n", + "client = Neo4jDB(uri=uri, username=username, password=password)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the connection to the Aura ```Neo4j``` database is established, you can check the connection info using the ```get_api_key``` method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get connection info\n", + "client.get_api_key()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MWKVm4IkgHPn" + }, + "source": [ + "## Initialization\n", + "If you are successfully connected to ```Neo4j Aura```, there are some basic indexes already created.\n", + "\n", + "But, in this tutorial we will create a new index and add items (nodes) to it.\n", + "\n", + "To do this, we first look at how to manage indexes.\n", + "\n", + "To manage indexes, we will see how to:\n", + "* List indexes\n", + "* Create new index\n", + "* Delete index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mdvN2pRtzpHB" + }, + "source": [ + "### List Indexs\n", + "Before creating a new index, let's check the indexes already in the ```Neo4j``` database." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "8AgTNAl5zo3E" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['index_343aff4e', 'index_f7700477']\n" + ] + } + ], + "source": [ + "# get name list of indexes\n", + "names = client.list_indexes()\n", + "\n", + "print(names)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R94h5sY7oLyh" + }, + "source": [ + "### Create Index\n", + "\n", + "Now we will create a new index.\n", + "\n", + "This can be done by calling `create_index` method, which will return an object connected to newly created index.\n", + "\n", + "If an index exists with the same name, the method will print out notification.\n", + "\n", + "When we create a new index, we must provide embedding object or dimension of vector, and ```metric``` to use for similarity search.\n", + "\n", + "In this tutorial we will pass `OpenAIEmbeddings` when we create a new index.\n", + "\n", + "\n", + "**[ NOTE ]**\n", + "- If you pass dimension of vector instead of embedding object, this must match the dimension of 
embeded vector of your choice of embedding model.\n", + "- An embedding object must have ```embed_query``` and ```embed_documents``` methods.\n", + "- ```metric``` is used to set distance method for similarity search. ```Neo4j``` supports ```cosine``` and ```euclidean```." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "tR3bb-F5hCf9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created index information\n", + "Index name: tutorial_index\n", + "Node label: tutorial_node\n", + "Similarity metric: COSINE\n", + "Embedding dimension: 1536\n", + "Embedding node property: embedding\n", + "Text node property: text\n", + "\n", + "Index creation was successful\n", + "['index_343aff4e', 'index_f7700477', 'tutorial_index']\n" + ] + } + ], + "source": [ + "# Initialize OpenAIEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "# set index_name and node_label\n", + "index_name = \"tutorial_index\"\n", + "node_label = \"tutorial_node\"\n", + "\n", + "# create a new index\n", + "index = client.create_index(\n", + " embedding=embeddings, index_name=index_name, node_label=node_label\n", + ")\n", + "\n", + "if isinstance(index, Neo4jDB):\n", + " print(\"Index creation was successful\")\n", + "\n", + "# check name list of indexes\n", + "names = client.list_indexes()\n", + "print(names)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AKYPHPk40c4X" + }, + "source": [ + "### Delete Index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ua5yewan0TVy" + }, + "source": [ + "We can delete specific index by calling `delete_index` method.\n", + "\n", + "Delete ```tutorial_index``` we created above and then create it again to use later." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['index_343aff4e', 'index_f7700477']\n", + "Created index information\n", + "Index name: tutorial_index\n", + "Node label: tutorial_node\n", + "Similarity metric: COSINE\n", + "Embedding dimension: 1536\n", + "Embedding node property: embedding\n", + "Text node property: text\n", + "\n" + ] + } + ], + "source": [ + "# delete index\n", + "client.delete_index(\"tutorial_index\")\n", + "\n", + "# print name list of indexes\n", + "names = client.list_indexes()\n", + "if 'tutorial_index' not in names:\n", + " print(f\"Index deleted succesfully \")\n", + " print(names)\n", + "\n", + "# recreate the tutorial_index\n", + "index = client.create_index(\n", + " embedding=embeddings, index_name=\"tutorial_index\", node_label=\"tutorial_node\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lwb_OMHunjwh" + }, + "source": [ + "### Select Embeddings model\n", + "\n", + "We also can change embedding model.\n", + "\n", + "In this subsection we use ```text-embedding-3-large``` model to create a new index with it" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "tRjB4EvXnoZM" + }, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "if not os.environ.get(\"OPENAI_API_KEY\"):\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter API key for OpenAI: \")\n", + "\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created index information\n", + "Index name: tutorial_index_2\n", + "Node label: tutorial_node_2\n", + "Similarity metric: COSINE\n", + "Embedding dimension: 3072\n", + "Embedding 
node property: embedding\n", + "Text node property: text\n", + "\n" + ] + } + ], + "source": [ + "# create new index\n", + "index2 = client.create_index(\n", + " embedding=embeddings, index_name=\"tutorial_index_2\", node_label=\"tutorial_node_2\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FtUJ2xSPrq3P" + }, + "source": [ + "### Data Preprocessing\n", + "\n", + "Below is the preprocessing process for general documents.\n", + "\n", + "- Need to extract ```metadata``` from documents\n", + "- Filter documents by minimum length.\n", + " \n", + "- Determine whether to use ```basename``` or not. Default is ```False```.\n", + " - ```basename``` denotes the last value of the filepath.\n", + " - For example, ```document.pdf``` will be the ```basename``` for the filepath ```./data/document.pdf```." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "bmQI6bOsvJbu" + }, + "outputs": [], + "source": [ + "# This is a long document we can split up.\n", + "data_path = \"./data/the_little_prince.txt\"\n", + "with open(data_path) as f:\n", + " raw_text = f.read()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VYdoMc4zvPe2", + "outputId": "83cff661-a0bd-4ac3-d5dd-52c61ebd1627" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='The Little Prince\n", + "Written By Antoine de Saiot-Exupery (1900〜1944)'\n" + ] + } + ], + "source": [ + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "# define text splitter\n", + "text_splitter = RecursiveCharacterTextSplitter(\n", + " # Set a really small chunk size, just to show.\n", + " chunk_size=100,\n", + " chunk_overlap=20,\n", + " length_function=len,\n", + " is_separator_regex=False,\n", + ")\n", + "\n", + "# split raw text by splitter.\n", + "split_docs = text_splitter.create_documents([raw_text])\n", + 
"\n", + "# print one of documents to check its structure\n", + "print(split_docs[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we preprocess splited document to extract author, page and source metadata while fit the data to store it into `Neo4j`" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "gGJ_bBJcw5af" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='The Little Prince\n", + "Written By Antoine de Saiot-Exupery (1900〜1944)' metadata={'source': 'the_little_prince.txt', 'page': 1, 'author': 'Saiot-Exupery'}\n" + ] + } + ], + "source": [ + "# preprocess raw documents\n", + "processed_docs = client.preprocess_documents(\n", + " split_docs=split_docs,\n", + " metadata_keys=[\"source\", \"page\", \"author\"],\n", + " min_length=5,\n", + " use_basename=True,\n", + " source=data_path,\n", + " author=\"Saiot-Exupery\",\n", + ")\n", + "\n", + "# print one of preprocessed document to chekc its structure\n", + "print(processed_docs[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1B7xc2p3lXPV" + }, + "source": [ + "## Manage vector store\n", + "Once you have created your vector store, we can interact with it by adding and deleting different items.\n", + "\n", + "Also, you can scroll data from the store with filter or with ```Cypher``` query." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kxxP2ecAohWb" + }, + "source": [ + "\n", + "### Add items to vector store\n", + "\n", + "We can add items to our vector store by using the ```upsert_documents``` or ```upsert_documents_parallel``` method.\n", + "\n", + "If you pass ids along with documents, then ids will be used, but if you do not pass ids, it will be created based `page_content` using md5 hash function.\n", + "\n", + "Basically, ```upsert_document``` and ```upsert_document_parallel``` methods do upsert not insert, based on **id** of the item.\n", + "\n", + "So if you provided id and want to update data, you must provide the same id that you provided at first upsertion.\n", + "\n", + "We will upsert data to index, tutorial_index, with ```upsert_documents``` method for the first half, and with ```upsert_documents_parallel``` for the second half." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "id": "7cS0FHgalwPm" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "542a5d379f074057bcbbfbe05a134d23", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upserting documents...: 0%| | 0/22 [00:00= (5, 23, 0): + return True + else: + return False + + def connect(self) -> None: + """Connect to neo4j graph database. 
+ If connection cannot be established, raise error + If connection established succesfully, prints connection info and return None + """ + return self.client + + def get_api_key(self): + return { + "NEO4J_URI": self.uri, + "NEO4J_USERNAME": self.username, + "NEO4J_PASSWORD": self.password, + } + + def create_index( + self, + embedding, + embedding_node_property: str = "embedding", + text_node_property: str = "text", + index_name: str = "vector", + metric: str = "cosine", + node_label: str = "Chunk", + _database: str = "neo4j", + **kwargs, + ): + if index_name in self.list_indexes(): + print(f"index {index_name} exists") + return self._return_exist_index( + self.client, + uri=self.uri, + username=self.username, + password=self.password, + embedding=embedding, + embedding_node_property=embedding_node_property, + text_node_property=text_node_property, + index_name=index_name, + metric=metric, + node_label=node_label, + _database=_database, + ) + + return self._create_new_index( + self.client, + uri=self.uri, + username=self.username, + password=self.password, + embedding=embedding, + embedding_node_property=embedding_node_property, + text_node_property=text_node_property, + index_name=index_name, + metric=metric, + node_label=node_label, + _database=_database, + ) + + @classmethod + def _return_exist_index(cls, + client, + uri, + username, + password, + embedding, + embedding_node_property: str = "embedding", + text_node_property: str = "text", + index_name: str = "vector", + metric: str = "cosine", + node_label: str = "Chunk", + _database: str = "neo4j", + **kwargs,): + return cls( + uri=uri, + username=username, + password=password, + embedding=embedding, + index_name=index_name, + node_label=node_label, + _database=_database, + metric=metric, + embedding_node_property=embedding_node_property, + text_node_property=text_node_property, + ) + + @classmethod + def _create_new_index( + cls, + client, + uri, + username, + password, + embedding, + embedding_node_property: 
str = "embedding", + text_node_property: str = "text", + index_name: str = "vector", + metric: str = "cosine", + node_label: str = "Chunk", + _database: str = "neo4j", + **kwargs, + ): + """Create new vector index in Neo4j. + + Args: + - index_name : Index name for new index. Default is `vector` + - node_label : Node label for nodes in the index. Default is `Chunk` + - embedding_node_property : Name for embedding. Default is `embedding` + - metric : Distance used to calculate similarity. Default is `cosine`. + Supports `cosine`, `euclidean`. + + Returns: + - returns True if index is created successfully + """ + + assert ( + metric in METRIC.keys() + ), f"Choose metric among {list(METRIC.keys())}. Your metric is {metric}" + + if embedding is None and kwargs.get("dimension", None) is None: + raise ValueError( + "You must provide either embedding function or dimension of resulting vector when you encode a document with your choice of embedding function." + ) + + if "dimension" in kwargs: + dimension = kwargs["dimension"] + else: + dimension = len(embedding.embed_query("foo")) + index_name = index_name + node_label = node_label + metric = METRIC[metric] + + index_query = ( + f"CREATE VECTOR INDEX {index_name} IF NOT EXISTS " + f"FOR (m:`{node_label}`) ON m.`{embedding_node_property}` " + "OPTIONS { indexConfig: { " + "`vector.dimensions`: toInteger($embedding_dimension), " + "`vector.similarity_function`: $similarity_metric }}" + ) + + parameters = { + "embedding_dimension": dimension, + "similarity_metric": metric, + } + + try: + client.execute_query( + index_query, parameters_=parameters, database=_database + ) + except Exception as e: + print("Failed to create index") + print(e) + + else: + info_str = ( + f"Index name: {index_name}\n" + f"Node label: {node_label}\n" + f"Similarity metric: {metric}\n" + f"Embedding dimension: {dimension}\n" + f"Embedding node property: {embedding_node_property}\n" + f"Text node property: {text_node_property}\n" + ) + print("Created 
index information") + print(info_str) + return cls( + uri=uri, + username=username, + password=password, + embedding=embedding, + index_name=index_name, + node_label=node_label, + _database=_database, + metric=metric, + embedding_node_property=embedding_node_property, + text_node_property=text_node_property, + ) + + @classmethod + def _connect_to_index(cls, client, embedding, index_name, node_label): + return cls(index_name=index_name, embedding=embedding, node_label=node_label) + + def connect_to_index(self, index_name, embedding=None): + """Connect to existing index + Args: + - index_name: Name of index to connect + + Return: + - Neo4jDB instance + """ + query = f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes" + node_label = self.client.execute_query(query).records[0]["labelsOrTypes"][0] + + if embedding is not None: + self.embedding = embedding + + return self._connect_to_index( + self.client, self.embedding, index_name, node_label + ) + + def list_indexes(self): + """Get list of index in current Neo4j database. + Returns: + - list of index names + """ + + query = """ + SHOW INDEXES + """ + + indexes = self.client.execute_query(query) + + result = [record["name"] for record in indexes.records] + + return result + + def get_index(self, index_name: str) -> Dict: + """Get information for given index name + + Args: + - index_name : index name to get information. + + Returns: + Information about the index. + """ + query = f""" + SHOW INDEXES YIELD * WHERE name='{index_name}' + """ + + try: + result = self.client.execute_query(query) + except Exception as e: + print("error occured while get index information") + raise e + else: + if len(result.records) == 0: + return None + result = {k: result.records[0][k] for k in result.keys} + return result + + def delete_index(self, index_name: str) -> Union[bool, None]: + """Delete index + + Args: + - index_name : index name to delete. + + Returns: + True if index deleted successfully. 
+ If error occured, will raise error. + """ + query = f"DROP INDEX {index_name}" + if self.get_index(index_name) is None: + return f"{index_name} does not exists" + + try: + self.client.execute_query(query) + except Exception as e: + print(f"Drop index {index_name} failed") + raise e + else: + return True + + # Query related functions + def query(self, index_name, query_vector=None, top_k=10, **kwargs): + pass + + def delete_node( + self, + index_name: str = None, + filters: List[Dict] = None, + ids: List = None, + **kwargs, + ) -> bool: + """Delete nodes by filter + One of filters or ids must be provided, but not both. + + Args: + - index_name: index of nodes to delete + - filters: Delete nodes matching these filters + - ids: Delete nodes matching these ids + + Returns: + - True if deletion was successful + - raise error if deletion failed + """ + if filters is None and ids is None: + raise AssertionError("You must provide one of filters or ids") + elif filters is not None and ids is not None: + raise AssertionError("You must provide only one of filters or ids") + + if filters is not None: + return self.delete_by_filter(index_name, filters) + elif ids is not None: + return self.delete_by_id(index_name, ids) + + def delete_by_id(self, index_name: str = None, ids: List = None, **kwargs) -> bool: + """Delete nodes by filter + One of filters or ids must be provided, but not both. 
+ + Args: + - index_name: index of nodes to delete + - ids: Delete nodes matching these ids + + Returns: + - True if deletion was successful + - raise error if deletion failed + """ + + if index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if ids is not None: + if not isinstance(ids, list): + ids = [ids] + filter_query = f"n.id IN {ids}" + + query = prefix_query + " WHERE " + filter_query + "\nDETACH DELETE n" + + try: + self.client.execute_query(query) + except Exception as e: + print("Delete by filter failed") + raise e + else: + return True + + def delete_by_filter( + self, index_name: str = None, filters: Dict = None, **kwargs + ) -> bool: + """Delete nodes by filter + One of filters or ids must be provided, but not both. + + Args: + - index_name: index of nodes to delete + - filters: Delete nodes matching these filters + + Returns: + - True if deletion was successful + - raise error if deletion failed + """ + + if index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if filters is not None: + filter_queries = [] + for k, v in filters.items(): + if not isinstance(v, list): + v = [v] + filter_queries.append(f"n.{k} IN {v}") + filter_query = " AND ".join(filter_queries) + + query = prefix_query + " WHERE " + filter_query + "\nDETACH DELETE n" + + try: + self.client.execute_query(query) + except Exception as e: + print("Delete by filter failed") + raise e + else: + return True + + # Document upsert related functions + + def add_embedding(self, documents: list[Document], ids: list[str] = []) -> list: + """Encode documents + Args: + - documents: List of documents to upsert into 
the vectorstore + - ids: List of ids for each documents. If not provided, md5 hash function will created based on the text of each document. + + Returns: Returns (encoded_vectors, id, metadata) tuple list + """ + + texts = [doc.page_content for doc in documents] + + if not all(ids): + ids = [md5(text.encode("utf-8")).hexdigest() for text in texts] + + metadatas = [doc.metadata for doc in documents] + + encoded = self.embedding.embed_documents(texts) + + return (texts, encoded, ids, metadatas) + + def _insert_documents(self, documents: list[Document], ids: list[str] = []): + """util function for upsert_document. + + Args: + - documents: List of Document to upsert to database + - ids: List of ids paired with documents. If not provided will be created by md5 hash function. + + Return: + - ids: List of ids upserted documents. If ids were provided this must be the same to the ids provided. + """ + + texts, encodes, ids, metadatas = self.add_embedding(documents, ids) + + parameters = { + "data": [ + {"text": text, "metadata": metadata, "embedding": encode, "id": id} + for text, metadata, encode, id in zip(texts, metadatas, encodes, ids) + ] + } + + if self.is_neo4j_above_523: + call_prefix = "CALL (row) { " + else: + call_prefix = "CALL { WITH row " + + import_query = ( + "UNWIND $data AS row " + f"{call_prefix}" + f"MERGE (c:`{self.node_label}` {{id: row.id}}) " + "WITH c, row " + f"CALL db.create.setNodeVectorProperty(c, " + f"'{self.embedding_node_property}', row.embedding) " + f"SET c.`{self.text_node_property}` = row.text " + "SET c += row.metadata " + "} IN TRANSACTIONS OF 1000 ROWS " + ) + try: + self.client.execute_query(import_query, parameters_=parameters) + except Exception as e: + if "can only be executed in an implicit transaction" in str(e): + self.client.session().run(neo4j.Query(text=import_query), parameters) + elif "failed to obtain a connection from the pool" in str(e): + time.sleep(10) + self.client.session().run(neo4j.Query(text=import_query), 
parameters) + + return ids + + def upsert_documents(self, documents, batch_size=32, ids=None, **kwargs): + """Upsert documents into the vectorstore + + Args: + - documents: List of documents to upsert into the vectorstore + - batch_size: Batch size of documents to add or update. Default is 32. + - kwargs: Additional keyword arguments. + if kwargs contains ids and documents contain ids, + the ids in the kwargs will receive precedence. + + Returns: + Returns list of ids of the documents upserted. + """ + assert self.index_name is not None, "You MUST connect to index first." + + if self.node_label is None and self.index_name is not None: + self.node_label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + + if ids is not None: + assert len(ids) == len( + documents + ), "Size of documents and ids must be the same" + + else: + ids = [False] * len(documents) + + id_batches = [ids[i : i + batch_size] for i in range(0, len(ids), batch_size)] + + if batch_size > len(documents): + batch_size = len(documents) + + doc_batches = [ + documents[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + + result_ids = [] + + for doc_batch, id_batch in zip(doc_batches, id_batches): + result_ids.extend(self._insert_documents(doc_batch, id_batch)) + + return result_ids + + def upsert_documents_parallel( + self, documents, batch_size=32, max_workers=10, ids = None, **kwargs + ): + """Add or update documents in the vectorstore parallel. + + Args: + documents: Documents to add to the vectorstore. + batch_size: Batch size of documents to add or update. + Default is 32. + max_workers: Number of threads to use. + Default is 10. + kwargs: Additional keyword arguments. + if kwargs contains ids and documents contain ids, + the ids in the kwargs will receive precedence. + + Returns: + List of IDs of the added texts. 
+ + Raises: + ValueError: If the number of ids does not match the number of documents. + """ + assert self.index_name is not None, "You MUST connect to index first." + + if ids is not None: + assert len(ids) == len( + documents + ), "Size of documents and ids must be the same" + + if batch_size > len(documents): + batch_size = len(documents) + + doc_bathces = [ + documents[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + id_batches = [ + ids[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + + if self.node_label is None and self.index_name is not None: + self.node_label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit(self.upsert_documents, batch, ids=ids) for batch, ids in zip(doc_bathces, id_batches) + ] + results = [] + for future in tqdm( + as_completed(futures), total=len(futures), desc="Upserting documents..." + ): + result = future.result() + if result: + results.extend(result) + + return results + + def delete_by_query( + self, query: str, index_name: Optional[str | None] = None, **kwrags + ) -> bool: + """Delete nodes by query + Args: + - query: Cypher query + - index_name: Optional. Default is None. If specified, will delete node only in the given index + + Returns: + - True if deletion is successful else raise error + """ + try: + self.client.execute_query(query) + except Exception as e: + print(f"Error {e} occured during deletion") + raise e + else: + return True + + def scroll_by_query( + self, query: str, index_name: Optional[str | None] = None, **kwargs + ) -> List: + """Scroll nodes by query + Args: + - query: Cypher query + - index_name: Optional. Default is None. 
If specified, will scroll node only in the given index + + Returns: + - List of nodes if successful, else raise error + """ + print(query) + try: + _result = self.client.execute_query(query) + except Exception as e: + print(f"Error {e} occured during scroll") + raise e + else: + result = [] + for record in _result.records: + result.append({k: record[k] for k in record.keys()}) + return result + + def scroll_by_filter( + self, + filters=None, + ids=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ) -> List[Dict]: + """Query nodes by filter or id + If none of filter or id provided, will return all nodes. + If this method is called directly from client without index_name set, all nodes will be returned. + + Args: + - filters: filter for query data + - ids: id for query data + - limit: number of nodes to return + - include_embedding: Set True to include embedded vector to result. Default is False + - include_meta: list of metadata keys to include. If set to None, all metadatas will be included. Default is None. 
+ + Returns: + - list of nodes + """ + + if self.index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if filters is None and ids is None: + filter_query = "" + + elif filters is not None: + filter_queries = [] + for k, v in filters.items(): + if not isinstance(v, list): + v = [v] + filter_queries.append(f"n.{k} IN {v}") + filter_query = " AND ".join(filter_queries) + + elif ids is not None: + filter_queries = [] + filter_query = f"n.id IN {ids}" + + limit_query = "\nRETURN n" if limit is None else f"\nRETURN n LIMIT {limit}" + + if filter_query != "": + query = prefix_query + " WHERE " + filter_query + limit_query + else: + query = prefix_query + limit_query + + _results = self.client.execute_query(query) + + results = list() + + for _result in _results.records: + node = _result["n"] + + result = {"id": node["id"]} + if include_embedding: + result.update({"embedding": node["embedding"]}) + if include_meta is None: + include_meta = [k for k in node.keys() if k not in ["id", "embedding"]] + result.update( + {"metadata": {k: node[k] for k in node.keys() if k in include_meta}} + ) + results.append(result) + + return results + + def scroll_by_ids( + self, + ids=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ) -> List[Dict]: + """Query nodes by filter or id + If none of filter or id provided, will return all nodes. + If this method is called directly from client without index_name set, all nodes will be returned. + + Args: + - ids: id for query data + - limit: number of nodes to return + - include_embedding: Set True to include embedded vector to result. Default is False + - include_meta: list of metadata keys to include. If set to None, all metadatas will be included. Default is None. 
+ + Returns: + - list of nodes + """ + + if self.index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if ids is not None: + if not isinstance(ids, list): + ids = [ids] + filter_queries = [] + filter_query = f"n.id IN {ids}" + + limit_query = "\nRETURN n" if limit is None else f"\nRETURN n LIMIT {limit}" + + query = prefix_query + " WHERE " + filter_query + limit_query + + _results = self.client.execute_query(query) + + results = list() + + for _result in _results.records: + node = _result["n"] + + result = {"id": node["id"]} + if include_embedding: + result.update({"embedding": node["embedding"]}) + if include_meta is None: + include_meta = [k for k in node.keys() if k not in ["id", "embedding"]] + result.update( + {"metadata": {k: node[k] for k in node.keys() if k in include_meta}} + ) + results.append(result) + + return results + + def scroll_nodes(self, filters=None, ids=None, query=None, limit=10, include_embedding=False, include_meta=None, **kwargs): + if filters is not None: + print('by filter') + return self.scroll_by_filter(filters=filters, include_embedding=include_embedding, include_meta=include_meta, limit=limit) + elif ids is not None: + print('by ids') + return self.scroll_by_ids(ids=ids, include_embedding=include_embedding, include_meta=include_meta, limit=limit) + elif query is not None: + print('by query') + return self.scroll_by_query(query=query) + else: + return self.scroll_by_filter(include_embedding=include_embedding, include_meta=include_meta, limit=limit) + + @staticmethod + def preprocess_documents( + split_docs, metadata_keys, min_length, use_basename=False, **kwargs + ): + metadata = kwargs + + if use_basename: + assert metadata.get("source", None) is not None, "source must be provided" + metadata['source'] = 
metadata['source'].split('/')[-1] + + result_docs = [] + for idx, doc in enumerate(split_docs): + if len(doc.page_content) < min_length: + continue + for k in metadata_keys: + doc.metadata.update({k: metadata.get(k, "")}) + doc.metadata.update({"page": idx + 1}) + result_docs.append(doc) + + return result_docs From bbadd2a6f28336c6850bc8b37c08fc23c8ffc409 Mon Sep 17 00:00:00 2001 From: Jongho Lee Date: Fri, 17 Jan 2025 00:12:58 +0900 Subject: [PATCH 2/3] [N-2] 09-VectorStore / 09-Neo4j Add import time Delete unused import Add more detailed information for Neo4j Aura sign in --- 09-VectorStore/09-Neo4j.ipynb | 120 +++++++++++------------ 09-VectorStore/utils/neo4j_interface.py | 122 ++++++++++++++---------- 2 files changed, 129 insertions(+), 113 deletions(-) diff --git a/09-VectorStore/09-Neo4j.ipynb b/09-VectorStore/09-Neo4j.ipynb index 284c54f15..214463a97 100644 --- a/09-VectorStore/09-Neo4j.ipynb +++ b/09-VectorStore/09-Neo4j.ipynb @@ -20,7 +20,7 @@ "\n", "In this tutorial we utilize its ability to store vectors only, and deal with its real ability, Graph database, later.\n", "\n", - "To encode data into vector, we use ```OpenAI Embedding```, but you can use any embedding you want.\n", + "To encode data into vector, we use ```OpenAIEmbedding```, but you can use any embedding you want.\n", "\n", "Furthermore, you need to note that you should read about ```Cypher```, declarative query language for Neo4j, to fully utilize Neo4j.\n", "\n", @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": { "id": "n9NVKk-Zf9Nq" }, @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": { "id": "IMx2hZNXf9QL" }, @@ -124,7 +124,6 @@ " \"langchain_openai\",\n", " \"neo4j\",\n", " \"nltk\",\n", - " \"graphdatascience\",\n", " ],\n", " verbose=False,\n", " 
upgrade=False,\n", @@ -133,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": { "id": "N8C6pLTZf9Sb" }, @@ -152,14 +151,14 @@ "\n", "set_env(\n", " {\n", - " \"OPENAI_API_KEY\": \"Your OepnAI API Key\",\n", - " \"LANGCHAIN_API_KEY\": \"Your LangChain API Key\",\n", + " \"OPENAI_API_KEY\": \"Your OpenAI API KEY\",\n", + " \"LANGCHAIN_API_KEY\": \"Your LangChain API KEY\",\n", " \"LANGCHAIN_TRACING_V2\": \"true\",\n", " \"LANGCHAIN_ENDPOINT\": \"https://api.smith.langchain.com\",\n", " \"LANGCHAIN_PROJECT\": \"Neo4j\",\n", - " \"NEO4J_URI\": \"Your NEO4J Aura URI\",\n", - " \"NEO4J_USERNAME\": \"Your NEO4J Aura username\",\n", - " \"NEO4J_PASSWORD\": \"Your NEO4J Aura password\",\n", + " \"NEO4J_URI\": \"Your Neo4j URI\",\n", + " \"NEO4J_USERNAME\": \"Your Neo4j Username\",\n", + " \"NEO4J_PASSWORD\": \"Your Neo4j Password\",\n", " }\n", ")" ] @@ -175,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -184,7 +183,7 @@ "False" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -208,10 +207,14 @@ "But we will also describe how to deploy ```Neo4j``` with docker.\n", "\n", "### Getting started with Aura\n", - "You can create a new ```Neo4j Aura``` account at [Neo4j](https://neo4j.com/) offical website.\n", + "You can create a new **Neo4j Aura** account at [Neo4j](https://neo4j.com/) offical website.\n", "\n", "Visit web site and click Get Started Free at top right.\n", "\n", + "If you done signing in, you will se a button, **Create instance** and after that you will see your username and password.\n", + "\n", + "To get your API Key, click **Download and continue** to download a txt file which contains API key to connect your **NEO4j Aura** .\n", + "\n", "### Getting started with Docker\n", "We now describe how to run ```Neo4j``` using docker.\n", "\n", @@ -247,7 +250,7 @@ }, { "cell_type": "code", - 
"execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -260,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 27, "metadata": { "id": "QzFkuokSnL1e" }, @@ -271,7 +274,7 @@ "text": [ "Connected to Neo4j database\n", "Connection info\n", - "URI=neo4j+s://3ed1167e.databases.neo4j.io\n", + "URI=neo4j+s://977d36e1.databases.neo4j.io\n", "username=neo4j\n", "Neo4j version is above 5.23\n" ] @@ -314,7 +317,7 @@ }, "source": [ "## Initialization\n", - "If you are succesfully connected to ```Neo4j Aura```, there are some basic indexes already created.\n", + "If you are succesfully connected to **Neo4j Aura**, there are some basic indexes already created.\n", "\n", "But, in this tutorial we will create a new indexand will add items(nodes) to it.\n", "\n", @@ -338,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "metadata": { "id": "8AgTNAl5zo3E" }, @@ -380,12 +383,12 @@ "**[ NOTE ]**\n", "- If you pass dimension of vector instead of embedding object, this must match the dimension of embeded vector of your choice of embedding model.\n", "- An embedding object must have ```embed_query``` and ```embed_documents``` methods.\n", - "- ```metric``` is used to set distance method for similarity search. ```Neo4j``` supports ```cosine``` and ```euclidean```." + "- ```metric``` is used to set distance method for similarity search. ```Neo4j``` supports **cosine** and **euclidean** ." 
] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, "metadata": { "id": "tR3bb-F5hCf9" }, @@ -452,13 +455,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "Index deleted succesfully \n", "['index_343aff4e', 'index_f7700477']\n", "Created index information\n", "Index name: tutorial_index\n", @@ -477,7 +481,7 @@ "\n", "# print name list of indexes\n", "names = client.list_indexes()\n", - "if 'tutorial_index' not in names:\n", + "if \"tutorial_index\" not in names:\n", " print(f\"Index deleted succesfully \")\n", " print(names)\n", "\n", @@ -502,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 12, "metadata": { "id": "tRjB4EvXnoZM" }, @@ -521,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -556,17 +560,17 @@ "\n", "Below is the preprocessing process for general documents.\n", "\n", - "- Need to extract ```metadata``` from documents\n", + "- Need to extract **metadata** from documents\n", "- Filter documents by minimum length.\n", " \n", "- Determine whether to use ```basename``` or not. Default is ```False```.\n", " - ```basename``` denotes the last value of the filepath.\n", - " - For example, ```document.pdf``` will be the ```basename``` for the filepath ```./data/document.pdf```." + " - For example, **document.pdf** will be the ```basename``` for the filepath **./data/document.pdf** ." 
] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "metadata": { "id": "bmQI6bOsvJbu" }, @@ -580,7 +584,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -626,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, "metadata": { "id": "gGJ_bBJcw5af" }, @@ -689,7 +693,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 17, "metadata": { "id": "7cS0FHgalwPm" }, @@ -697,7 +701,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "542a5d379f074057bcbbfbe05a134d23", + "model_id": "177d94aa3ab8485cba915995106519bb", "version_major": 2, "version_minor": 0 }, @@ -719,13 +723,11 @@ ], "source": [ "from uuid import uuid4\n", - "from langchain_core.documents import Document\n", "\n", "# make ids for each document\n", "uuids = [str(uuid4()) for _ in range(len(processed_docs))]\n", "\n", "\n", - "\n", "# upsert documents\n", "total_number = len(processed_docs)\n", "upsert_result = index.upsert_documents(\n", @@ -746,7 +748,7 @@ "print(len(result))\n", "\n", "# check manual ids are the same as output ids\n", - "print(\"Manual Ids == Output Ids:\",sorted(result)==sorted(uuids))\n" + "print(\"Manual Ids == Output Ids:\", sorted(result) == sorted(uuids))" ] }, { @@ -760,12 +762,12 @@ "We can delete nodes by filter or ids with `delete_node` method.\n", "\n", "\n", - "For example, we will delete the first page, that is `page` 1, of the little prince, and try to scroll it." + "For example, we will delete **the first page**, that is `page` 1, of the little prince, and try to scroll it." 
] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 18, "metadata": { "id": "OWmeKCqLo3ht" }, @@ -789,14 +791,14 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "by filter\n", + "Scroll nodes by filter\n", "[]\n" ] } @@ -821,7 +823,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -840,26 +842,26 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': '48c5061b-b45f-4116-8594-dd1ad375caf6',\n", + "{'id': '00936d12-fa47-4d48-bd13-51766696f942',\n", " 'metadata': {'author': 'Saiot-Exupery',\n", " 'text': '[ Antoine de Saiot-Exupery ]',\n", " 'source': 'the_little_prince.txt',\n", " 'page': 2}}" ] }, - "execution_count": 29, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sorted(result, key = lambda x: x['metadata']['page'])[0]" + "sorted(result, key=lambda x: x[\"metadata\"][\"page\"])[0]" ] }, { @@ -871,7 +873,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -893,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -922,15 +924,15 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "by filter\n", - "[{'id': '11732600-72fe-4d87-b191-66e31f4e8731', 'metadata': {'author': 'Saiot-Exupery', 'text': 'learned to fly a plane. 
Five years later, he would leave the military in order to begin flying air', 'source': 'the_little_prince.txt', 'page': 10}}]\n" + "Scroll nodes by filter\n", + "[{'id': 'e2f6114c-0d1c-4cd1-8ee1-b0834e48eabc', 'metadata': {'author': 'Saiot-Exupery', 'text': 'learned to fly a plane. Five years later, he would leave the military in order to begin flying air', 'source': 'the_little_prince.txt', 'page': 10}}]\n" ] } ], @@ -945,15 +947,15 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "by ids\n", - "[{'id': '8922addd-f92a-4c1b-a34b-673ef9ac8289', 'metadata': {'author': 'Saiot-Exupery', 'text': 'For Saint-Exupéry, it was a grand adventure - one with dangers lurking at every corner. Flying his', 'source': 'the_little_prince.txt', 'page': 12}}]\n" + "Scroll nodes by ids\n", + "[{'id': '883d6d11-f484-4615-b328-8d794b65a235', 'metadata': {'author': 'Saiot-Exupery', 'text': 'For Saint-Exupéry, it was a grand adventure - one with dangers lurking at every corner. Flying his', 'source': 'the_little_prince.txt', 'page': 12}}]\n" ] } ], @@ -975,15 +977,14 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "by query\n", - "MATCH (n) WHERE n.page IN [10,11,12] AND n.author='Saiot-Exupery' RETURN n.page, n.author, n.text\n", + "Scroll nodes by query\n", "{'n.page': 10, 'n.author': 'Saiot-Exupery', 'n.text': 'learned to fly a plane. Five years later, he would leave the military in order to begin flying air'}\n", "{'n.page': 11, 'n.author': 'Saiot-Exupery', 'n.text': 'to begin flying air mail between remote settlements in the Sahara desert.'}\n", "{'n.page': 12, 'n.author': 'Saiot-Exupery', 'n.text': 'For Saint-Exupéry, it was a grand adventure - one with dangers lurking at every corner. 
Flying his'}\n" @@ -1009,15 +1010,8 @@ "\n", "You can now do the basics of how to use Neo4j.\n", "\n", - "If you want more advanced tasks, please refer to `Neo4j` official API documents and office Python SDK of `Neo4j` API documents." + "If you want to do more advanced tasks, please refer to `Neo4j` official API documents and official Python SDK of `Neo4j` API documents." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/09-VectorStore/utils/neo4j_interface.py b/09-VectorStore/utils/neo4j_interface.py index 8c36482ea..88c6c8eb0 100644 --- a/09-VectorStore/utils/neo4j_interface.py +++ b/09-VectorStore/utils/neo4j_interface.py @@ -1,14 +1,10 @@ -try: - from .vectordbinterface import VectorDBInterface -except: - from vectordbinterface import VectorDBInterface import neo4j from langchain_core.documents import Document from typing import List, Union, Dict, Any, Optional from concurrent.futures import ThreadPoolExecutor, as_completed from tqdm.auto import tqdm from hashlib import md5 -import os +import os, time METRIC = { "cosine": "COSINE", @@ -31,15 +27,16 @@ def __init__( text_node_property=None, ): if uri is None: - uri=os.environ.get("NEO4J_URI", None) + uri = os.environ.get("NEO4J_URI", None) if username is None: - username=os.environ.get("NEO4J_USERNAME", None) + username = os.environ.get("NEO4J_USERNAME", None) if password is None: - password=os.environ.get("NEO4J_PASSWORD", None) + password = os.environ.get("NEO4J_PASSWORD", None) - assert all([uri, username, password]), "You must set NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD environmental variables or initialize Neo4jDB class by pass the variables directly" + assert all( + [uri, username, password] + ), "You must set NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD environmental variables or initialize Neo4jDB class by pass the variables directly" - if embedding is not None: assert "embed_query" in dir(embedding) and 
"embed_documents" in dir( embedding @@ -151,7 +148,8 @@ def create_index( ) @classmethod - def _return_exist_index(cls, + def _return_exist_index( + cls, client, uri, username, @@ -163,19 +161,20 @@ def _return_exist_index(cls, metric: str = "cosine", node_label: str = "Chunk", _database: str = "neo4j", - **kwargs,): + **kwargs, + ): return cls( - uri=uri, - username=username, - password=password, - embedding=embedding, - index_name=index_name, - node_label=node_label, - _database=_database, - metric=metric, - embedding_node_property=embedding_node_property, - text_node_property=text_node_property, - ) + uri=uri, + username=username, + password=password, + embedding=embedding, + index_name=index_name, + node_label=node_label, + _database=_database, + metric=metric, + embedding_node_property=embedding_node_property, + text_node_property=text_node_property, + ) @classmethod def _create_new_index( @@ -576,7 +575,7 @@ def upsert_documents(self, documents, batch_size=32, ids=None, **kwargs): return result_ids def upsert_documents_parallel( - self, documents, batch_size=32, max_workers=10, ids = None, **kwargs + self, documents, batch_size=32, max_workers=10, ids=None, **kwargs ): """Add or update documents in the vectorstore parallel. 
@@ -620,7 +619,8 @@ def upsert_documents_parallel( with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = [ - executor.submit(self.upsert_documents, batch, ids=ids) for batch, ids in zip(doc_bathces, id_batches) + executor.submit(self.upsert_documents, batch, ids=ids) + for batch, ids in zip(doc_bathces, id_batches) ] results = [] for future in tqdm( @@ -662,7 +662,6 @@ def scroll_by_query( Returns: - List of nodes if successful, else raise error """ - print(query) try: _result = self.client.execute_query(query) except Exception as e: @@ -675,14 +674,14 @@ def scroll_by_query( return result def scroll_by_filter( - self, - filters=None, - ids=None, - limit=10, - include_embedding=False, - include_meta=None, - **kwargs, - ) -> List[Dict]: + self, + filters=None, + ids=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ) -> List[Dict]: """Query nodes by filter or id If none of filter or id provided, will return all nodes. If this method is called directly from client without index_name set, all nodes will be returned. @@ -748,13 +747,13 @@ def scroll_by_filter( return results def scroll_by_ids( - self, - ids=None, - limit=10, - include_embedding=False, - include_meta=None, - **kwargs, - ) -> List[Dict]: + self, + ids=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ) -> List[Dict]: """Query nodes by filter or id If none of filter or id provided, will return all nodes. If this method is called directly from client without index_name set, all nodes will be returned. 
@@ -776,7 +775,7 @@ def scroll_by_ids( prefix_query = f"MATCH (n:{label})\n" else: prefix_query = "MATCH (n)\n" - + if ids is not None: if not isinstance(ids, list): ids = [ids] @@ -806,18 +805,41 @@ def scroll_by_ids( return results - def scroll_nodes(self, filters=None, ids=None, query=None, limit=10, include_embedding=False, include_meta=None, **kwargs): + def scroll_nodes( + self, + filters=None, + ids=None, + query=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ): if filters is not None: - print('by filter') - return self.scroll_by_filter(filters=filters, include_embedding=include_embedding, include_meta=include_meta, limit=limit) + print("Scroll nodes by filter") + return self.scroll_by_filter( + filters=filters, + include_embedding=include_embedding, + include_meta=include_meta, + limit=limit, + ) elif ids is not None: - print('by ids') - return self.scroll_by_ids(ids=ids, include_embedding=include_embedding, include_meta=include_meta, limit=limit) + print("Scroll nodes by ids") + return self.scroll_by_ids( + ids=ids, + include_embedding=include_embedding, + include_meta=include_meta, + limit=limit, + ) elif query is not None: - print('by query') + print("Scroll nodes by query") return self.scroll_by_query(query=query) else: - return self.scroll_by_filter(include_embedding=include_embedding, include_meta=include_meta, limit=limit) + return self.scroll_by_filter( + include_embedding=include_embedding, + include_meta=include_meta, + limit=limit, + ) @staticmethod def preprocess_documents( @@ -827,7 +849,7 @@ def preprocess_documents( if use_basename: assert metadata.get("source", None) is not None, "source must be provided" - metadata['source'] = metadata['source'].split('/')[-1] + metadata["source"] = metadata["source"].split("/")[-1] result_docs = [] for idx, doc in enumerate(split_docs): From 63c1a873c80b5855d3ab990e321df2e46d2d925d Mon Sep 17 00:00:00 2001 From: XaviereKU <45342996+XaviereKU@users.noreply.github.com> Date: 
Sun, 19 Jan 2025 15:39:53 +0900 Subject: [PATCH 3/3] =?UTF-8?q?[N-2]=2009-VectorStore=20/=2009-Neo4j=20?= =?UTF-8?q?=EC=9C=A0=EC=82=AC=EB=8F=84=20=EA=B2=80=EC=83=89=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 09-VectorStore/09-Neo4j.ipynb | 141 +- 09-VectorStore/utils/neo4j_interface.py | 1824 ++++++++++++----------- 2 files changed, 1055 insertions(+), 910 deletions(-) diff --git a/09-VectorStore/09-Neo4j.ipynb b/09-VectorStore/09-Neo4j.ipynb index 214463a97..7ba4e5e95 100644 --- a/09-VectorStore/09-Neo4j.ipynb +++ b/09-VectorStore/09-Neo4j.ipynb @@ -47,6 +47,7 @@ "\t- [Delete items from vector store](#delete-items-from-vector-store)\n", "\t- [Scroll items from vector store](#scroll-items-from-vector-store)\n", "\t- [(Advanced)Scroll items with query](#advanced-scroll-items-with-query)\n", + "- [Similarity search](#similarity-search)\n", "\n", "### References\n", "\n", @@ -151,14 +152,14 @@ "\n", "set_env(\n", " {\n", - " \"OPENAI_API_KEY\": \"Your OpenAI API KEY\",\n", - " \"LANGCHAIN_API_KEY\": \"Your LangChain API KEY\",\n", + " \"OPENAI_API_KEY\": \"Your OpenAI API Key\",\n", + " \"LANGCHAIN_API_KEY\": \"Your LangChain API Key\",\n", " \"LANGCHAIN_TRACING_V2\": \"true\",\n", " \"LANGCHAIN_ENDPOINT\": \"https://api.smith.langchain.com\",\n", " \"LANGCHAIN_PROJECT\": \"Neo4j\",\n", - " \"NEO4J_URI\": \"Your Neo4j URI\",\n", - " \"NEO4J_USERNAME\": \"Your Neo4j Username\",\n", - " \"NEO4J_PASSWORD\": \"Your Neo4j Password\",\n", + " \"NEO4J_URI\": \"Your Neo4j Aura URI\",\n", + " \"NEO4J_USERNAME\": \"Your Neo4j Aura username\",\n", + " \"NEO4J_PASSWORD\": \"Your Neo4j Aura password\",\n", " }\n", ")" ] @@ -251,19 +252,6 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from importlib import reload\n", - "import utils.neo4j_interface as ni\n", - "\n", - "reload(ni)\n", - "Neo4jDB = ni.Neo4jDB" - ] - }, - { - "cell_type": "code", 
- "execution_count": 27, "metadata": { "id": "QzFkuokSnL1e" }, @@ -274,7 +262,7 @@ "text": [ "Connected to Neo4j database\n", "Connection info\n", - "URI=neo4j+s://977d36e1.databases.neo4j.io\n", + "URI=neo4j+s://3ed1167e.databases.neo4j.io\n", "username=neo4j\n", "Neo4j version is above 5.23\n" ] @@ -341,7 +329,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "id": "8AgTNAl5zo3E" }, @@ -388,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": { "id": "tR3bb-F5hCf9" }, @@ -455,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -506,7 +494,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": { "id": "tRjB4EvXnoZM" }, @@ -520,12 +508,12 @@ "\n", "from langchain_openai import OpenAIEmbeddings\n", "\n", - "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\")" + "embeddings_large = OpenAIEmbeddings(model=\"text-embedding-3-large\")" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -546,7 +534,9 @@ "source": [ "# create new index\n", "index2 = client.create_index(\n", - " embedding=embeddings, index_name=\"tutorial_index_2\", node_label=\"tutorial_node_2\"\n", + " embedding=embeddings_large,\n", + " index_name=\"tutorial_index_2\",\n", + " node_label=\"tutorial_node_2\",\n", ")" ] }, @@ -570,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": { "id": "bmQI6bOsvJbu" }, @@ -578,13 +568,13 @@ "source": [ "# This is a long document we can split up.\n", "data_path = \"./data/the_little_prince.txt\"\n", - "with open(data_path) as f:\n", + "with open(data_path, encoding=\"utf8\") as f:\n", " raw_text = f.read()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -630,7 
+620,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": { "id": "gGJ_bBJcw5af" }, @@ -693,7 +683,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": { "id": "7cS0FHgalwPm" }, @@ -701,7 +691,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "177d94aa3ab8485cba915995106519bb", + "model_id": "826cd68e8ba547ef8726bd7b7dc33670", "version_major": 2, "version_minor": 0 }, @@ -767,7 +757,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": { "id": "OWmeKCqLo3ht" }, @@ -791,7 +781,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -823,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -842,20 +832,20 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': '00936d12-fa47-4d48-bd13-51766696f942',\n", + "{'id': '8f9ed6b2-4fc5-4c23-a32b-d53acc72a68a',\n", " 'metadata': {'author': 'Saiot-Exupery',\n", " 'text': '[ Antoine de Saiot-Exupery ]',\n", " 'source': 'the_little_prince.txt',\n", " 'page': 2}}" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -873,7 +863,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -895,7 +885,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -924,7 +914,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -932,7 +922,7 @@ "output_type": "stream", "text": [ "Scroll nodes by filter\n", - "[{'id': 'e2f6114c-0d1c-4cd1-8ee1-b0834e48eabc', 'metadata': {'author': 'Saiot-Exupery', 'text': 'learned to fly a plane. 
Five years later, he would leave the military in order to begin flying air', 'source': 'the_little_prince.txt', 'page': 10}}]\n" + "[{'id': '8fcae3d1-8d41-4010-9458-6324a87c6cb4', 'metadata': {'author': 'Saiot-Exupery', 'text': 'learned to fly a plane. Five years later, he would leave the military in order to begin flying air', 'source': 'the_little_prince.txt', 'page': 10}}]\n" ] } ], @@ -947,7 +937,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -955,7 +945,7 @@ "output_type": "stream", "text": [ "Scroll nodes by ids\n", - "[{'id': '883d6d11-f484-4615-b328-8d794b65a235', 'metadata': {'author': 'Saiot-Exupery', 'text': 'For Saint-Exupéry, it was a grand adventure - one with dangers lurking at every corner. Flying his', 'source': 'the_little_prince.txt', 'page': 12}}]\n" + "[{'id': '9f4790f0-6f1b-428c-87c7-dbc3b909852a', 'metadata': {'author': 'Saiot-Exupery', 'text': 'For Saint-Exupéry, it was a grand adventure - one with dangers lurking at every corner. Flying his', 'source': 'the_little_prince.txt', 'page': 12}}]\n" ] } ], @@ -977,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1002,6 +992,63 @@ " print(item)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Similarity search\n", + "As ```Neo4j``` supports vector database, you can also do similarity search.\n", + "\n", + "The similarity is calculated by the metric you set when you created the index to search on.\n", + "\n", + "In this tutorial we will search items on **tutorial_index** , which has metric **cosine** .\n", + "\n", + "To do search, we call ```search``` method.\n", + "\n", + "You can pass the raw text(to ```query``` paramter), or embeded vector of the text(to ```embeded_query``` paramter) when calling ```search```." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RESULT BY RAW QUERY\n", + "{'text': '\"My friend the fox--\" the little prince said to me.', 'metadata': {'id': '70d75baa-3bed-4751-b0cf-98157e190756', 'author': 'Saiot-Exupery', 'source': 'the_little_prince.txt', 'page': 1087, 'embedding': None}, 'score': 0.947}\n", + "{'text': 'And the little prince asked himself:', 'metadata': {'id': '9e779e02-1d2b-4252-a8f4-78bae7866af5', 'author': 'Saiot-Exupery', 'source': 'the_little_prince.txt', 'page': 492, 'embedding': None}, 'score': 0.946}\n", + "\n", + "RESULT BY EMBEDED QUERY\n", + "{'text': '\"My friend the fox--\" the little prince said to me.', 'metadata': {'id': '70d75baa-3bed-4751-b0cf-98157e190756', 'author': 'Saiot-Exupery', 'source': 'the_little_prince.txt', 'page': 1087, 'embedding': None}, 'score': 0.947}\n", + "{'text': 'And the little prince asked himself:', 'metadata': {'id': '9e779e02-1d2b-4252-a8f4-78bae7866af5', 'author': 'Saiot-Exupery', 'source': 'the_little_prince.txt', 'page': 492, 'embedding': None}, 'score': 0.946}\n" + ] + } + ], + "source": [ + "# do search. 
top_k is the number of documents in the result\n", + "res_with_text = index.search(query=\"Does the little prince have a friend?\", top_k=5)\n", + "\n", + "# print out top 2 results\n", + "print(\"RESULT BY RAW QUERY\")\n", + "for i in range(2):\n", + " print(res_with_text[i])\n", + "\n", + "# embed query\n", + "embeded_query = embeddings.embed_query(\"Does the little prince have a friend?\")\n", + "\n", + "# do search with embeded vector value\n", + "res_with_embed = index.search(embeded_query=embeded_query, top_k=5)\n", + "\n", + "# print out top 2 results\n", + "print()\n", + "print(\"RESULT BY EMBEDED QUERY\")\n", + "for i in range(2):\n", + " print(res_with_embed[i])" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1020,7 +1067,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "cp311", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1038,5 +1085,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 4 } diff --git a/09-VectorStore/utils/neo4j_interface.py b/09-VectorStore/utils/neo4j_interface.py index 88c6c8eb0..992ca8849 100644 --- a/09-VectorStore/utils/neo4j_interface.py +++ b/09-VectorStore/utils/neo4j_interface.py @@ -1,863 +1,961 @@ -import neo4j -from langchain_core.documents import Document -from typing import List, Union, Dict, Any, Optional -from concurrent.futures import ThreadPoolExecutor, as_completed -from tqdm.auto import tqdm -from hashlib import md5 -import os, time - -METRIC = { - "cosine": "COSINE", - "euclidean": "EUCLIDEAN", -} - - -class Neo4jDB: - def __init__( - self, - embedding=None, - uri=None, - username=None, - password=None, - index_name=None, - node_label=None, - _database="neo4j", - metric=None, - embedding_node_property=None, - text_node_property=None, - ): - if uri is None: - uri = os.environ.get("NEO4J_URI", None) - if username is None: - username = os.environ.get("NEO4J_USERNAME", None) - if password is None: - password = os.environ.get("NEO4J_PASSWORD", None) - - 
assert all( - [uri, username, password] - ), "You must set NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD environmental variables or initialize Neo4jDB class by pass the variables directly" - - if embedding is not None: - assert "embed_query" in dir(embedding) and "embed_documents" in dir( - embedding - ), "embedding must have have embed_query and embed_document methods.\nProvided embedding does not have both of those." - - self.uri = uri - self.username = username - self.password = password - self.embedding = embedding - self.index_name = index_name - self.node_label = node_label - self._database = _database - self.embedding_node_property = embedding_node_property - self.text_node_property = text_node_property - self.metric = metric - - try: - self.client = neo4j.GraphDatabase.driver( - uri=self.uri, auth=(self.username, self.password) - ) - except Exception as e: - print(e) - raise e - else: - self.is_neo4j_above_523 = self.check_neo4j_version() - if self.is_neo4j_above_523: - version_str = "Neo4j version is above 5.23" - else: - version_str = "Neo4j version is below 5.24" - if self.index_name is None: - print("Connected to Neo4j database") - print(f"Connection info\nURI={self.uri}\nusername={self.username}") - print(version_str) - - def check_neo4j_version(self): - db_data = self.client.execute_query("CALL dbms.components()") - version = db_data[0][0]["versions"][0] - - if "aura" in version: - version_tuple = tuple(map(int, version.split("-")[0].split("."))) + (0,) - else: - version_tuple = tuple(map(int, version.split("."))) - - target_version = (5, 11, 0) - - if version_tuple < target_version: - raise ValueError( - "Version index is only supported in Neo4j version 5.11 or greater" - ) - - if version_tuple >= (5, 23, 0): - return True - else: - return False - - def connect(self) -> None: - """Connect to neo4j graph database. 
- If connection cannot be established, raise error - If connection established succesfully, prints connection info and return None - """ - return self.client - - def get_api_key(self): - return { - "NEO4J_URI": self.uri, - "NEO4J_USERNAME": self.username, - "NEO4J_PASSWORD": self.password, - } - - def create_index( - self, - embedding, - embedding_node_property: str = "embedding", - text_node_property: str = "text", - index_name: str = "vector", - metric: str = "cosine", - node_label: str = "Chunk", - _database: str = "neo4j", - **kwargs, - ): - if index_name in self.list_indexes(): - print(f"index {index_name} exists") - return self._return_exist_index( - self.client, - uri=self.uri, - username=self.username, - password=self.password, - embedding=embedding, - embedding_node_property=embedding_node_property, - text_node_property=text_node_property, - index_name=index_name, - metric=metric, - node_label=node_label, - _database=_database, - ) - - return self._create_new_index( - self.client, - uri=self.uri, - username=self.username, - password=self.password, - embedding=embedding, - embedding_node_property=embedding_node_property, - text_node_property=text_node_property, - index_name=index_name, - metric=metric, - node_label=node_label, - _database=_database, - ) - - @classmethod - def _return_exist_index( - cls, - client, - uri, - username, - password, - embedding, - embedding_node_property: str = "embedding", - text_node_property: str = "text", - index_name: str = "vector", - metric: str = "cosine", - node_label: str = "Chunk", - _database: str = "neo4j", - **kwargs, - ): - return cls( - uri=uri, - username=username, - password=password, - embedding=embedding, - index_name=index_name, - node_label=node_label, - _database=_database, - metric=metric, - embedding_node_property=embedding_node_property, - text_node_property=text_node_property, - ) - - @classmethod - def _create_new_index( - cls, - client, - uri, - username, - password, - embedding, - 
embedding_node_property: str = "embedding", - text_node_property: str = "text", - index_name: str = "vector", - metric: str = "cosine", - node_label: str = "Chunk", - _database: str = "neo4j", - **kwargs, - ): - """Create new vector index in Neo4j. - - Args: - - index_name : Index name for new index. Default is `vector` - - node_label : Node label for nodes in the index. Default is `Chunk` - - embedding_node_property : Name for embedding. Default is `embedding` - - metric : Distance used to calculate similarity. Default is `cosine`. - Supports `cosine`, `euclidean`. - - Returns: - - returns True if index is created successfully - """ - - assert ( - metric in METRIC.keys() - ), f"Choose metric among {list(METRIC.keys())}. Your metric is {metric}" - - if embedding is None and kwargs.get("dimension", None) is None: - raise ValueError( - "You must provide either embedding function or dimension of resulting vector when you encode a document with your choice of embedding function." - ) - - if "dimension" in kwargs: - dimension = kwargs["dimension"] - else: - dimension = len(embedding.embed_query("foo")) - index_name = index_name - node_label = node_label - metric = METRIC[metric] - - index_query = ( - f"CREATE VECTOR INDEX {index_name} IF NOT EXISTS " - f"FOR (m:`{node_label}`) ON m.`{embedding_node_property}` " - "OPTIONS { indexConfig: { " - "`vector.dimensions`: toInteger($embedding_dimension), " - "`vector.similarity_function`: $similarity_metric }}" - ) - - parameters = { - "embedding_dimension": dimension, - "similarity_metric": metric, - } - - try: - client.execute_query( - index_query, parameters_=parameters, database=_database - ) - except Exception as e: - print("Failed to create index") - print(e) - - else: - info_str = ( - f"Index name: {index_name}\n" - f"Node label: {node_label}\n" - f"Similarity metric: {metric}\n" - f"Embedding dimension: {dimension}\n" - f"Embedding node property: {embedding_node_property}\n" - f"Text node property: 
{text_node_property}\n" - ) - print("Created index information") - print(info_str) - return cls( - uri=uri, - username=username, - password=password, - embedding=embedding, - index_name=index_name, - node_label=node_label, - _database=_database, - metric=metric, - embedding_node_property=embedding_node_property, - text_node_property=text_node_property, - ) - - @classmethod - def _connect_to_index(cls, client, embedding, index_name, node_label): - return cls(index_name=index_name, embedding=embedding, node_label=node_label) - - def connect_to_index(self, index_name, embedding=None): - """Connect to existing index - Args: - - index_name: Name of index to connect - - Return: - - Neo4jDB instance - """ - query = f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes" - node_label = self.client.execute_query(query).records[0]["labelsOrTypes"][0] - - if embedding is not None: - self.embedding = embedding - - return self._connect_to_index( - self.client, self.embedding, index_name, node_label - ) - - def list_indexes(self): - """Get list of index in current Neo4j database. - Returns: - - list of index names - """ - - query = """ - SHOW INDEXES - """ - - indexes = self.client.execute_query(query) - - result = [record["name"] for record in indexes.records] - - return result - - def get_index(self, index_name: str) -> Dict: - """Get information for given index name - - Args: - - index_name : index name to get information. - - Returns: - Information about the index. - """ - query = f""" - SHOW INDEXES YIELD * WHERE name='{index_name}' - """ - - try: - result = self.client.execute_query(query) - except Exception as e: - print("error occured while get index information") - raise e - else: - if len(result.records) == 0: - return None - result = {k: result.records[0][k] for k in result.keys} - return result - - def delete_index(self, index_name: str) -> Union[bool, None]: - """Delete index - - Args: - - index_name : index name to delete. 
- - Returns: - True if index deleted successfully. - If error occured, will raise error. - """ - query = f"DROP INDEX {index_name}" - if self.get_index(index_name) is None: - return f"{index_name} does not exists" - - try: - self.client.execute_query(query) - except Exception as e: - print(f"Drop index {index_name} failed") - raise e - else: - return True - - # Query related functions - def query(self, index_name, query_vector=None, top_k=10, **kwargs): - pass - - def delete_node( - self, - index_name: str = None, - filters: List[Dict] = None, - ids: List = None, - **kwargs, - ) -> bool: - """Delete nodes by filter - One of filters or ids must be provided, but not both. - - Args: - - index_name: index of nodes to delete - - filters: Delete nodes matching these filters - - ids: Delete nodes matching these ids - - Returns: - - True if deletion was successful - - raise error if deletion failed - """ - if filters is None and ids is None: - raise AssertionError("You must provide one of filters or ids") - elif filters is not None and ids is not None: - raise AssertionError("You must provide only one of filters or ids") - - if filters is not None: - return self.delete_by_filter(index_name, filters) - elif ids is not None: - return self.delete_by_id(index_name, ids) - - def delete_by_id(self, index_name: str = None, ids: List = None, **kwargs) -> bool: - """Delete nodes by filter - One of filters or ids must be provided, but not both. 
- - Args: - - index_name: index of nodes to delete - - ids: Delete nodes matching these ids - - Returns: - - True if deletion was successful - - raise error if deletion failed - """ - - if index_name is not None: - label = self.client.execute_query( - f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes" - ).records[0]["labelsOrTypes"][0] - prefix_query = f"MATCH (n:{label})\n" - else: - prefix_query = "MATCH (n)\n" - - if ids is not None: - if not isinstance(ids, list): - ids = [ids] - filter_query = f"n.id IN {ids}" - - query = prefix_query + " WHERE " + filter_query + "\nDETACH DELETE n" - - try: - self.client.execute_query(query) - except Exception as e: - print("Delete by filter failed") - raise e - else: - return True - - def delete_by_filter( - self, index_name: str = None, filters: Dict = None, **kwargs - ) -> bool: - """Delete nodes by filter - One of filters or ids must be provided, but not both. - - Args: - - index_name: index of nodes to delete - - filters: Delete nodes matching these filters - - Returns: - - True if deletion was successful - - raise error if deletion failed - """ - - if index_name is not None: - label = self.client.execute_query( - f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes" - ).records[0]["labelsOrTypes"][0] - prefix_query = f"MATCH (n:{label})\n" - else: - prefix_query = "MATCH (n)\n" - - if filters is not None: - filter_queries = [] - for k, v in filters.items(): - if not isinstance(v, list): - v = [v] - filter_queries.append(f"n.{k} IN {v}") - filter_query = " AND ".join(filter_queries) - - query = prefix_query + " WHERE " + filter_query + "\nDETACH DELETE n" - - try: - self.client.execute_query(query) - except Exception as e: - print("Delete by filter failed") - raise e - else: - return True - - # Document upsert related functions - - def add_embedding(self, documents: list[Document], ids: list[str] = []) -> list: - """Encode documents - Args: - - documents: List of documents to upsert into 
the vectorstore - - ids: List of ids for each documents. If not provided, md5 hash function will created based on the text of each document. - - Returns: Returns (encoded_vectors, id, metadata) tuple list - """ - - texts = [doc.page_content for doc in documents] - - if not all(ids): - ids = [md5(text.encode("utf-8")).hexdigest() for text in texts] - - metadatas = [doc.metadata for doc in documents] - - encoded = self.embedding.embed_documents(texts) - - return (texts, encoded, ids, metadatas) - - def _insert_documents(self, documents: list[Document], ids: list[str] = []): - """util function for upsert_document. - - Args: - - documents: List of Document to upsert to database - - ids: List of ids paired with documents. If not provided will be created by md5 hash function. - - Return: - - ids: List of ids upserted documents. If ids were provided this must be the same to the ids provided. - """ - - texts, encodes, ids, metadatas = self.add_embedding(documents, ids) - - parameters = { - "data": [ - {"text": text, "metadata": metadata, "embedding": encode, "id": id} - for text, metadata, encode, id in zip(texts, metadatas, encodes, ids) - ] - } - - if self.is_neo4j_above_523: - call_prefix = "CALL (row) { " - else: - call_prefix = "CALL { WITH row " - - import_query = ( - "UNWIND $data AS row " - f"{call_prefix}" - f"MERGE (c:`{self.node_label}` {{id: row.id}}) " - "WITH c, row " - f"CALL db.create.setNodeVectorProperty(c, " - f"'{self.embedding_node_property}', row.embedding) " - f"SET c.`{self.text_node_property}` = row.text " - "SET c += row.metadata " - "} IN TRANSACTIONS OF 1000 ROWS " - ) - try: - self.client.execute_query(import_query, parameters_=parameters) - except Exception as e: - if "can only be executed in an implicit transaction" in str(e): - self.client.session().run(neo4j.Query(text=import_query), parameters) - elif "failed to obtain a connection from the pool" in str(e): - time.sleep(10) - self.client.session().run(neo4j.Query(text=import_query), 
parameters) - - return ids - - def upsert_documents(self, documents, batch_size=32, ids=None, **kwargs): - """Upsert documents into the vectorstore - - Args: - - documents: List of documents to upsert into the vectorstore - - batch_size: Batch size of documents to add or update. Default is 32. - - kwargs: Additional keyword arguments. - if kwargs contains ids and documents contain ids, - the ids in the kwargs will receive precedence. - - Returns: - Returns list of ids of the documents upserted. - """ - assert self.index_name is not None, "You MUST connect to index first." - - if self.node_label is None and self.index_name is not None: - self.node_label = self.client.execute_query( - f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" - ).records[0]["labelsOrTypes"][0] - - if ids is not None: - assert len(ids) == len( - documents - ), "Size of documents and ids must be the same" - - else: - ids = [False] * len(documents) - - id_batches = [ids[i : i + batch_size] for i in range(0, len(ids), batch_size)] - - if batch_size > len(documents): - batch_size = len(documents) - - doc_batches = [ - documents[i : i + batch_size] for i in range(0, len(documents), batch_size) - ] - - result_ids = [] - - for doc_batch, id_batch in zip(doc_batches, id_batches): - result_ids.extend(self._insert_documents(doc_batch, id_batch)) - - return result_ids - - def upsert_documents_parallel( - self, documents, batch_size=32, max_workers=10, ids=None, **kwargs - ): - """Add or update documents in the vectorstore parallel. - - Args: - documents: Documents to add to the vectorstore. - batch_size: Batch size of documents to add or update. - Default is 32. - max_workers: Number of threads to use. - Default is 10. - kwargs: Additional keyword arguments. - if kwargs contains ids and documents contain ids, - the ids in the kwargs will receive precedence. - - Returns: - List of IDs of the added texts. 
- - Raises: - ValueError: If the number of ids does not match the number of documents. - """ - assert self.index_name is not None, "You MUST connect to index first." - - if ids is not None: - assert len(ids) == len( - documents - ), "Size of documents and ids must be the same" - - if batch_size > len(documents): - batch_size = len(documents) - - doc_bathces = [ - documents[i : i + batch_size] for i in range(0, len(documents), batch_size) - ] - id_batches = [ - ids[i : i + batch_size] for i in range(0, len(documents), batch_size) - ] - - if self.node_label is None and self.index_name is not None: - self.node_label = self.client.execute_query( - f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" - ).records[0]["labelsOrTypes"][0] - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [ - executor.submit(self.upsert_documents, batch, ids=ids) - for batch, ids in zip(doc_bathces, id_batches) - ] - results = [] - for future in tqdm( - as_completed(futures), total=len(futures), desc="Upserting documents..." - ): - result = future.result() - if result: - results.extend(result) - - return results - - def delete_by_query( - self, query: str, index_name: Optional[str | None] = None, **kwrags - ) -> bool: - """Delete nodes by query - Args: - - query: Cypher query - - index_name: Optional. Default is None. If specified, will delete node only in the given index - - Returns: - - True if deletion is successful else raise error - """ - try: - self.client.execute_query(query) - except Exception as e: - print(f"Error {e} occured during deletion") - raise e - else: - return True - - def scroll_by_query( - self, query: str, index_name: Optional[str | None] = None, **kwargs - ) -> List: - """Scroll nodes by query - Args: - - query: Cypher query - - index_name: Optional. Default is None. 
If specified, will scroll node only in the given index - - Returns: - - List of nodes if successful, else raise error - """ - try: - _result = self.client.execute_query(query) - except Exception as e: - print(f"Error {e} occured during scroll") - raise e - else: - result = [] - for record in _result.records: - result.append({k: record[k] for k in record.keys()}) - return result - - def scroll_by_filter( - self, - filters=None, - ids=None, - limit=10, - include_embedding=False, - include_meta=None, - **kwargs, - ) -> List[Dict]: - """Query nodes by filter or id - If none of filter or id provided, will return all nodes. - If this method is called directly from client without index_name set, all nodes will be returned. - - Args: - - filters: filter for query data - - ids: id for query data - - limit: number of nodes to return - - include_embedding: Set True to include embedded vector to result. Default is False - - include_meta: list of metadata keys to include. If set to None, all metadatas will be included. Default is None. 
- - Returns: - - list of nodes - """ - - if self.index_name is not None: - label = self.client.execute_query( - f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" - ).records[0]["labelsOrTypes"][0] - prefix_query = f"MATCH (n:{label})\n" - else: - prefix_query = "MATCH (n)\n" - - if filters is None and ids is None: - filter_query = "" - - elif filters is not None: - filter_queries = [] - for k, v in filters.items(): - if not isinstance(v, list): - v = [v] - filter_queries.append(f"n.{k} IN {v}") - filter_query = " AND ".join(filter_queries) - - elif ids is not None: - filter_queries = [] - filter_query = f"n.id IN {ids}" - - limit_query = "\nRETURN n" if limit is None else f"\nRETURN n LIMIT {limit}" - - if filter_query != "": - query = prefix_query + " WHERE " + filter_query + limit_query - else: - query = prefix_query + limit_query - - _results = self.client.execute_query(query) - - results = list() - - for _result in _results.records: - node = _result["n"] - - result = {"id": node["id"]} - if include_embedding: - result.update({"embedding": node["embedding"]}) - if include_meta is None: - include_meta = [k for k in node.keys() if k not in ["id", "embedding"]] - result.update( - {"metadata": {k: node[k] for k in node.keys() if k in include_meta}} - ) - results.append(result) - - return results - - def scroll_by_ids( - self, - ids=None, - limit=10, - include_embedding=False, - include_meta=None, - **kwargs, - ) -> List[Dict]: - """Query nodes by filter or id - If none of filter or id provided, will return all nodes. - If this method is called directly from client without index_name set, all nodes will be returned. - - Args: - - ids: id for query data - - limit: number of nodes to return - - include_embedding: Set True to include embedded vector to result. Default is False - - include_meta: list of metadata keys to include. If set to None, all metadatas will be included. Default is None. 
- - Returns: - - list of nodes - """ - - if self.index_name is not None: - label = self.client.execute_query( - f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" - ).records[0]["labelsOrTypes"][0] - prefix_query = f"MATCH (n:{label})\n" - else: - prefix_query = "MATCH (n)\n" - - if ids is not None: - if not isinstance(ids, list): - ids = [ids] - filter_queries = [] - filter_query = f"n.id IN {ids}" - - limit_query = "\nRETURN n" if limit is None else f"\nRETURN n LIMIT {limit}" - - query = prefix_query + " WHERE " + filter_query + limit_query - - _results = self.client.execute_query(query) - - results = list() - - for _result in _results.records: - node = _result["n"] - - result = {"id": node["id"]} - if include_embedding: - result.update({"embedding": node["embedding"]}) - if include_meta is None: - include_meta = [k for k in node.keys() if k not in ["id", "embedding"]] - result.update( - {"metadata": {k: node[k] for k in node.keys() if k in include_meta}} - ) - results.append(result) - - return results - - def scroll_nodes( - self, - filters=None, - ids=None, - query=None, - limit=10, - include_embedding=False, - include_meta=None, - **kwargs, - ): - if filters is not None: - print("Scroll nodes by filter") - return self.scroll_by_filter( - filters=filters, - include_embedding=include_embedding, - include_meta=include_meta, - limit=limit, - ) - elif ids is not None: - print("Scroll nodes by ids") - return self.scroll_by_ids( - ids=ids, - include_embedding=include_embedding, - include_meta=include_meta, - limit=limit, - ) - elif query is not None: - print("Scroll nodes by query") - return self.scroll_by_query(query=query) - else: - return self.scroll_by_filter( - include_embedding=include_embedding, - include_meta=include_meta, - limit=limit, - ) - - @staticmethod - def preprocess_documents( - split_docs, metadata_keys, min_length, use_basename=False, **kwargs - ): - metadata = kwargs - - if use_basename: - assert metadata.get("source", 
None) is not None, "source must be provided" - metadata["source"] = metadata["source"].split("/")[-1] - - result_docs = [] - for idx, doc in enumerate(split_docs): - if len(doc.page_content) < min_length: - continue - for k in metadata_keys: - doc.metadata.update({k: metadata.get(k, "")}) - doc.metadata.update({"page": idx + 1}) - result_docs.append(doc) - - return result_docs +import neo4j +from langchain_core.documents import Document +from typing import List, Union, Dict, Any, Optional +from concurrent.futures import ThreadPoolExecutor, as_completed +from tqdm.auto import tqdm +from hashlib import md5 +import os, time + +METRIC = { + "cosine": "COSINE", + "euclidean": "EUCLIDEAN", +} + + +class Neo4jDB: + def __init__( + self, + embedding=None, + uri=None, + username=None, + password=None, + index_name=None, + node_label=None, + _database="neo4j", + metric=None, + embedding_node_property=None, + text_node_property=None, + dimension=None, + ): + if uri is None: + uri = os.environ.get("NEO4J_URI", None) + if username is None: + username = os.environ.get("NEO4J_USERNAME", None) + if password is None: + password = os.environ.get("NEO4J_PASSWORD", None) + + assert all( + [uri, username, password] + ), "You must set NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD environmental variables or initialize Neo4jDB class by pass the variables directly" + + if embedding is not None: + assert "embed_query" in dir(embedding) and "embed_documents" in dir( + embedding + ), "embedding must have have embed_query and embed_document methods.\nProvided embedding does not have both of those." 
+ + self.uri = uri + self.username = username + self.password = password + self.embedding = embedding + self.index_name = index_name + self.node_label = node_label + self._database = _database + self.embedding_node_property = embedding_node_property + self.text_node_property = text_node_property + self.metric = metric + self.dimension = dimension + + try: + self.client = neo4j.GraphDatabase.driver( + uri=self.uri, auth=(self.username, self.password) + ) + except Exception as e: + print(e) + raise e + else: + self.is_neo4j_above_523 = self.check_neo4j_version() + if self.is_neo4j_above_523: + version_str = "Neo4j version is above 5.23" + else: + version_str = "Neo4j version is below 5.24" + if self.index_name is None: + print("Connected to Neo4j database") + print(f"Connection info\nURI={self.uri}\nusername={self.username}") + print(version_str) + + def check_neo4j_version(self): + db_data = self.client.execute_query("CALL dbms.components()") + version = db_data[0][0]["versions"][0] + + if "aura" in version: + version_tuple = tuple(map(int, version.split("-")[0].split("."))) + (0,) + else: + version_tuple = tuple(map(int, version.split("."))) + + target_version = (5, 11, 0) + + if version_tuple < target_version: + raise ValueError( + "Version index is only supported in Neo4j version 5.11 or greater" + ) + + if version_tuple >= (5, 23, 0): + return True + else: + return False + + def connect(self) -> None: + """Connect to neo4j graph database. 
+ If connection cannot be established, raise error + If connection established succesfully, prints connection info and return None + """ + return self.client + + def get_api_key(self): + return { + "NEO4J_URI": self.uri, + "NEO4J_USERNAME": self.username, + "NEO4J_PASSWORD": self.password, + } + + def create_index( + self, + embedding, + embedding_node_property: str = "embedding", + text_node_property: str = "text", + index_name: str = "vector", + metric: str = "cosine", + node_label: str = "Chunk", + _database: str = "neo4j", + **kwargs, + ): + if index_name in self.list_indexes(): + print(f"index {index_name} exists") + return self._return_exist_index( + self.client, + uri=self.uri, + username=self.username, + password=self.password, + embedding=embedding, + embedding_node_property=embedding_node_property, + text_node_property=text_node_property, + index_name=index_name, + metric=metric, + node_label=node_label, + _database=_database, + ) + + return self._create_new_index( + self.client, + uri=self.uri, + username=self.username, + password=self.password, + embedding=embedding, + embedding_node_property=embedding_node_property, + text_node_property=text_node_property, + index_name=index_name, + metric=metric, + node_label=node_label, + _database=_database, + ) + + @classmethod + def _return_exist_index( + cls, + client, + uri, + username, + password, + embedding, + embedding_node_property: str = "embedding", + text_node_property: str = "text", + index_name: str = "vector", + metric: str = "cosine", + node_label: str = "Chunk", + _database: str = "neo4j", + **kwargs, + ): + query = f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes, properties" + info = client.execute_query(query).records[0] + node_label = info["labelsOrTypes"][0] + embedding_node_property = info["properties"][0] + return cls( + uri=uri, + username=username, + password=password, + embedding=embedding, + index_name=index_name, + node_label=node_label, + _database=_database, + 
metric=metric, + embedding_node_property=embedding_node_property, + text_node_property=text_node_property, + ) + + @classmethod + def _create_new_index( + cls, + client, + uri, + username, + password, + embedding, + embedding_node_property: str = "embedding", + text_node_property: str = "text", + index_name: str = "vector", + metric: str = "cosine", + node_label: str = "Chunk", + _database: str = "neo4j", + **kwargs, + ): + """Create new vector index in Neo4j. + + Args: + - index_name : Index name for new index. Default is `vector` + - node_label : Node label for nodes in the index. Default is `Chunk` + - embedding_node_property : Name for embedding. Default is `embedding` + - metric : Distance used to calculate similarity. Default is `cosine`. + Supports `cosine`, `euclidean`. + + Returns: + - returns True if index is created successfully + """ + + assert ( + metric in METRIC.keys() + ), f"Choose metric among {list(METRIC.keys())}. Your metric is {metric}" + + if embedding is None and kwargs.get("dimension", None) is None: + raise ValueError( + "You must provide either embedding function or dimension of resulting vector when you encode a document with your choice of embedding function." 
+ ) + + if "dimension" in kwargs: + dimension = kwargs["dimension"] + else: + dimension = len(embedding.embed_query("foo")) + index_name = index_name + node_label = node_label + metric = METRIC[metric] + + index_query = ( + f"CREATE VECTOR INDEX {index_name} IF NOT EXISTS " + f"FOR (m:`{node_label}`) ON m.`{embedding_node_property}` " + "OPTIONS { indexConfig: { " + "`vector.dimensions`: toInteger($embedding_dimension), " + "`vector.similarity_function`: $similarity_metric }}" + ) + + parameters = { + "embedding_dimension": dimension, + "similarity_metric": metric, + } + + try: + client.execute_query( + index_query, parameters_=parameters, database=_database + ) + except Exception as e: + print("Failed to create index") + print(e) + + else: + info_str = ( + f"Index name: {index_name}\n" + f"Node label: {node_label}\n" + f"Similarity metric: {metric}\n" + f"Embedding dimension: {dimension}\n" + f"Embedding node property: {embedding_node_property}\n" + f"Text node property: {text_node_property}\n" + ) + print("Created index information") + print(info_str) + return cls( + uri=uri, + username=username, + password=password, + embedding=embedding, + index_name=index_name, + node_label=node_label, + _database=_database, + metric=metric, + embedding_node_property=embedding_node_property, + text_node_property=text_node_property, + dimension=dimension, + ) + + @classmethod + def _connect_to_index(cls, client, embedding, index_name, node_label): + return cls(index_name=index_name, embedding=embedding, node_label=node_label) + + def connect_to_index(self, index_name, embedding=None): + """Connect to existing index + Args: + - index_name: Name of index to connect + + Return: + - Neo4jDB instance + """ + query = f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes" + node_label = self.client.execute_query(query).records[0]["labelsOrTypes"][0] + + if embedding is not None: + self.embedding = embedding + + return self._connect_to_index( + self.client, 
self.embedding, index_name, node_label + ) + + def list_indexes(self): + """Get list of index in current Neo4j database. + Returns: + - list of index names + """ + + query = """ + SHOW INDEXES + """ + + indexes = self.client.execute_query(query) + + result = [record["name"] for record in indexes.records] + + return result + + def get_index(self, index_name: str) -> Dict: + """Get information for given index name + + Args: + - index_name : index name to get information. + + Returns: + Information about the index. + """ + query = f""" + SHOW INDEXES YIELD * WHERE name='{index_name}' + """ + + try: + result = self.client.execute_query(query) + except Exception as e: + print("error occured while get index information") + raise e + else: + if len(result.records) == 0: + return None + result = {k: result.records[0][k] for k in result.keys} + return result + + def delete_index(self, index_name: str) -> Union[bool, None]: + """Delete index + + Args: + - index_name : index name to delete. + + Returns: + True if index deleted successfully. + If error occured, will raise error. + """ + query = f"DROP INDEX {index_name}" + if self.get_index(index_name) is None: + return f"{index_name} does not exists" + + try: + self.client.execute_query(query) + except Exception as e: + print(f"Drop index {index_name} failed") + raise e + else: + return True + + # Query related functions + def query(self, index_name, query_vector=None, top_k=10, **kwargs): + pass + + def delete_node( + self, + index_name: str = None, + filters: List[Dict] = None, + ids: List = None, + **kwargs, + ) -> bool: + """Delete nodes by filter + One of filters or ids must be provided, but not both. 
+ + Args: + - index_name: index of nodes to delete + - filters: Delete nodes matching these filters + - ids: Delete nodes matching these ids + + Returns: + - True if deletion was successful + - raise error if deletion failed + """ + if filters is None and ids is None: + raise AssertionError("You must provide one of filters or ids") + elif filters is not None and ids is not None: + raise AssertionError("You must provide only one of filters or ids") + + if filters is not None: + return self.delete_by_filter(index_name, filters) + elif ids is not None: + return self.delete_by_id(index_name, ids) + + def delete_by_id(self, index_name: str = None, ids: List = None, **kwargs) -> bool: + """Delete nodes by filter + One of filters or ids must be provided, but not both. + + Args: + - index_name: index of nodes to delete + - ids: Delete nodes matching these ids + + Returns: + - True if deletion was successful + - raise error if deletion failed + """ + + if index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if ids is not None: + if not isinstance(ids, list): + ids = [ids] + filter_query = f"n.id IN {ids}" + + query = prefix_query + " WHERE " + filter_query + "\nDETACH DELETE n" + + try: + self.client.execute_query(query) + except Exception as e: + print("Delete by filter failed") + raise e + else: + return True + + def delete_by_filter( + self, index_name: str = None, filters: Dict = None, **kwargs + ) -> bool: + """Delete nodes by filter + One of filters or ids must be provided, but not both. 
+ + Args: + - index_name: index of nodes to delete + - filters: Delete nodes matching these filters + + Returns: + - True if deletion was successful + - raise error if deletion failed + """ + + if index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if filters is not None: + filter_queries = [] + for k, v in filters.items(): + if not isinstance(v, list): + v = [v] + filter_queries.append(f"n.{k} IN {v}") + filter_query = " AND ".join(filter_queries) + + query = prefix_query + " WHERE " + filter_query + "\nDETACH DELETE n" + + try: + self.client.execute_query(query) + except Exception as e: + print("Delete by filter failed") + raise e + else: + return True + + # Document upsert related functions + + def add_embedding(self, documents: list[Document], ids: list[str] = []) -> list: + """Encode documents + Args: + - documents: List of documents to upsert into the vectorstore + - ids: List of ids for each documents. If not provided, md5 hash function will created based on the text of each document. + + Returns: Returns (encoded_vectors, id, metadata) tuple list + """ + + texts = [doc.page_content for doc in documents] + + if not all(ids): + ids = [md5(text.encode("utf-8")).hexdigest() for text in texts] + + metadatas = [doc.metadata for doc in documents] + + encoded = self.embedding.embed_documents(texts) + + return (texts, encoded, ids, metadatas) + + def _insert_documents(self, documents: list[Document], ids: list[str] = []): + """util function for upsert_document. + + Args: + - documents: List of Document to upsert to database + - ids: List of ids paired with documents. If not provided will be created by md5 hash function. + + Return: + - ids: List of ids upserted documents. If ids were provided this must be the same to the ids provided. 
+ """ + + texts, encodes, ids, metadatas = self.add_embedding(documents, ids) + + parameters = { + "data": [ + {"text": text, "metadata": metadata, "embedding": encode, "id": id} + for text, metadata, encode, id in zip(texts, metadatas, encodes, ids) + ] + } + + if self.is_neo4j_above_523: + call_prefix = "CALL (row) { " + else: + call_prefix = "CALL { WITH row " + + import_query = ( + "UNWIND $data AS row " + f"{call_prefix}" + f"MERGE (c:`{self.node_label}` {{id: row.id}}) " + "WITH c, row " + f"CALL db.create.setNodeVectorProperty(c, " + f"'{self.embedding_node_property}', row.embedding) " + f"SET c.`{self.text_node_property}` = row.text " + "SET c += row.metadata " + "} IN TRANSACTIONS OF 1000 ROWS " + ) + try: + self.client.execute_query(import_query, parameters_=parameters) + except Exception as e: + if "can only be executed in an implicit transaction" in str(e): + self.client.session().run(neo4j.Query(text=import_query), parameters) + elif "failed to obtain a connection from the pool" in str(e): + time.sleep(10) + self.client.session().run(neo4j.Query(text=import_query), parameters) + + return ids + + def upsert_documents(self, documents, batch_size=32, ids=None, **kwargs): + """Upsert documents into the vectorstore + + Args: + - documents: List of documents to upsert into the vectorstore + - batch_size: Batch size of documents to add or update. Default is 32. + - kwargs: Additional keyword arguments. + if kwargs contains ids and documents contain ids, + the ids in the kwargs will receive precedence. + + Returns: + Returns list of ids of the documents upserted. + """ + assert self.index_name is not None, "You MUST connect to index first." 
+ + if self.node_label is None and self.index_name is not None: + self.node_label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + + if ids is not None: + assert len(ids) == len( + documents + ), "Size of documents and ids must be the same" + + else: + ids = [False] * len(documents) + + id_batches = [ids[i : i + batch_size] for i in range(0, len(ids), batch_size)] + + if batch_size > len(documents): + batch_size = len(documents) + + doc_batches = [ + documents[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + + result_ids = [] + + for doc_batch, id_batch in zip(doc_batches, id_batches): + result_ids.extend(self._insert_documents(doc_batch, id_batch)) + + return result_ids + + def upsert_documents_parallel( + self, documents, batch_size=32, max_workers=10, ids=None, **kwargs + ): + """Add or update documents in the vectorstore parallel. + + Args: + documents: Documents to add to the vectorstore. + batch_size: Batch size of documents to add or update. + Default is 32. + max_workers: Number of threads to use. + Default is 10. + kwargs: Additional keyword arguments. + if kwargs contains ids and documents contain ids, + the ids in the kwargs will receive precedence. + + Returns: + List of IDs of the added texts. + + Raises: + ValueError: If the number of ids does not match the number of documents. + """ + assert self.index_name is not None, "You MUST connect to index first." 
+ + if ids is not None: + assert len(ids) == len( + documents + ), "Size of documents and ids must be the same" + + if batch_size > len(documents): + batch_size = len(documents) + + doc_bathces = [ + documents[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + id_batches = [ + ids[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + + if self.node_label is None and self.index_name is not None: + self.node_label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit(self.upsert_documents, batch, ids=ids) + for batch, ids in zip(doc_bathces, id_batches) + ] + results = [] + for future in tqdm( + as_completed(futures), total=len(futures), desc="Upserting documents..." + ): + result = future.result() + if result: + results.extend(result) + + return results + + def delete_by_query( + self, query: str, index_name: Optional[str | None] = None, **kwrags + ) -> bool: + """Delete nodes by query + Args: + - query: Cypher query + - index_name: Optional. Default is None. If specified, will delete node only in the given index + + Returns: + - True if deletion is successful else raise error + """ + try: + self.client.execute_query(query) + except Exception as e: + print(f"Error {e} occured during deletion") + raise e + else: + return True + + def scroll_by_query( + self, query: str, index_name: Optional[str | None] = None, **kwargs + ) -> List: + """Scroll nodes by query + Args: + - query: Cypher query + - index_name: Optional. Default is None. 
If specified, will scroll node only in the given index + + Returns: + - List of nodes if successful, else raise error + """ + try: + _result = self.client.execute_query(query) + except Exception as e: + print(f"Error {e} occured during scroll") + raise e + else: + result = [] + for record in _result.records: + result.append({k: record[k] for k in record.keys()}) + return result + + def scroll_by_filter( + self, + filters=None, + ids=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ) -> List[Dict]: + """Query nodes by filter or id + If none of filter or id provided, will return all nodes. + If this method is called directly from client without index_name set, all nodes will be returned. + + Args: + - filters: filter for query data + - ids: id for query data + - limit: number of nodes to return + - include_embedding: Set True to include embedded vector to result. Default is False + - include_meta: list of metadata keys to include. If set to None, all metadatas will be included. Default is None. 
+ + Returns: + - list of nodes + """ + + if self.index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if filters is None and ids is None: + filter_query = "" + + elif filters is not None: + filter_queries = [] + for k, v in filters.items(): + if not isinstance(v, list): + v = [v] + filter_queries.append(f"n.{k} IN {v}") + filter_query = " AND ".join(filter_queries) + + elif ids is not None: + filter_queries = [] + filter_query = f"n.id IN {ids}" + + limit_query = "\nRETURN n" if limit is None else f"\nRETURN n LIMIT {limit}" + + if filter_query != "": + query = prefix_query + " WHERE " + filter_query + limit_query + else: + query = prefix_query + limit_query + + _results = self.client.execute_query(query) + + results = list() + + for _result in _results.records: + node = _result["n"] + + result = {"id": node["id"]} + if include_embedding: + result.update({"embedding": node["embedding"]}) + if include_meta is None: + include_meta = [k for k in node.keys() if k not in ["id", "embedding"]] + result.update( + {"metadata": {k: node[k] for k in node.keys() if k in include_meta}} + ) + results.append(result) + + return results + + def scroll_by_ids( + self, + ids=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ) -> List[Dict]: + """Query nodes by filter or id + If none of filter or id provided, will return all nodes. + If this method is called directly from client without index_name set, all nodes will be returned. + + Args: + - ids: id for query data + - limit: number of nodes to return + - include_embedding: Set True to include embedded vector to result. Default is False + - include_meta: list of metadata keys to include. If set to None, all metadatas will be included. Default is None. 
+ + Returns: + - list of nodes + """ + + if self.index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if ids is not None: + if not isinstance(ids, list): + ids = [ids] + filter_query = f"n.id IN {ids}" + + limit_query = "\nRETURN n" if limit is None else f"\nRETURN n LIMIT {limit}" + + query = prefix_query + " WHERE " + filter_query + limit_query + + _results = self.client.execute_query(query) + + results = list() + + for _result in _results.records: + node = _result["n"] + + result = {"id": node["id"]} + if include_embedding: + result.update({"embedding": node["embedding"]}) + if include_meta is None: + include_meta = [k for k in node.keys() if k not in ["id", "embedding"]] + result.update( + {"metadata": {k: node[k] for k in node.keys() if k in include_meta}} + ) + results.append(result) + + return results + + def scroll_nodes( + self, + filters=None, + ids=None, + query=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ): + if filters is not None: + print("Scroll nodes by filter") + return self.scroll_by_filter( + filters=filters, + include_embedding=include_embedding, + include_meta=include_meta, + limit=limit, + ) + elif ids is not None: + print("Scroll nodes by ids") + return self.scroll_by_ids( + ids=ids, + include_embedding=include_embedding, + include_meta=include_meta, + limit=limit, + ) + elif query is not None: + print("Scroll nodes by query") + return self.scroll_by_query(query=query) + else: + return self.scroll_by_filter( + include_embedding=include_embedding, + include_meta=include_meta, + limit=limit, + ) + + @staticmethod + def preprocess_documents( + split_docs, metadata_keys, min_length, use_basename=False, **kwargs + ): + metadata = kwargs + + if use_basename: + assert metadata.get("source", None) is not None, 
"source must be provided" + metadata["source"] = metadata["source"].split("/")[-1] + + result_docs = [] + for idx, doc in enumerate(split_docs): + if len(doc.page_content) < min_length: + continue + for k in metadata_keys: + doc.metadata.update({k: metadata.get(k, "")}) + doc.metadata.update({"page": idx + 1}) + result_docs.append(doc) + + return result_docs + + def search( + self, + query=None, + embeded_query=None, + index_name=None, + filters=[], + with_score=False, + top_k=3, + **kwargs, + ): + assert self.index_name is not None, "You must provide index name" + + if query is None and embeded_query is None: + raise ValueError("You must provide either query or embeded values of query") + + if query is not None and embeded_query is not None: + print( + "Both query and embeded value of query passed. Using embded value of query" + ) + + if embeded_query is None: + embeded_query = self.embedding.embed_query(query) + + if kwargs.get("include_vector"): + result_query = ( + f"MATCH (n:`{self.node_label}`) " + f"WITH n, vector.similarity.cosine($embeded, n.embedding) AS score " + f"ORDER BY score DESC " + f"RETURN r, score LIMIT $k " + f"n {{.*, `{self.text_node_property}`: Null, `{self.embedding_node_property}`: Null}} AS metadata LIMIT $k " + ) + else: + result_query = ( + f"MATCH (n:`{self.node_label}`) " + f"WITH n, vector.similarity.cosine($embeded, n.embedding) AS score " + f"ORDER BY score DESC " + f"RETURN score, " + f"n {{.*, `{self.embedding_node_property}`: Null}} AS metadata LIMIT $k " + ) + + parameters = { + "k": top_k, + "embeded": embeded_query, + } + + try: + _result = self.client.execute_query(result_query, parameters_=parameters) + except: + _result = self.client.session(database=self._database).run( + neo4j.Query(text=result_query), parameters + ) + + result = [] + for _r in _result.records: + result.append( + { + "text": _r["metadata"].pop("text"), + "metadata": _r["metadata"], + "score": round(float(_r["score"]), 3), + } + ) + + return result + + 
@staticmethod
def remove_lucene_chars(text: str) -> str:
    """Remove Lucene special characters

    Every Lucene query operator character in `text` is replaced by a single
    space, then leading and trailing whitespace is stripped.

    Args:
    - text: Raw text, e.g. a user query destined for a full-text index

    Returns:
    - The text with each of + - & | ! ( ) { } [ ] ^ " ~ * ? : \\ turned into a space
    """
    lucene_specials = '+-&|!(){}[]^"~*?:\\'
    # One translate() pass replaces the whole character set at once.
    to_space = str.maketrans(lucene_specials, " " * len(lucene_specials))
    return text.translate(to_space).strip()