diff --git a/09-VectorStore/09-Neo4j.ipynb b/09-VectorStore/09-Neo4j.ipynb new file mode 100644 index 000000000..7ba4e5e95 --- /dev/null +++ b/09-VectorStore/09-Neo4j.ipynb @@ -0,0 +1,1089 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "z47qcQPDHZq_" + }, + "source": [ + "# Neo4j Vector Index\n", + "\n", + "- Author: [Jongho](https://github.com/XaviereKU)\n", + "- Design: \n", + "- Peer Review: \n", + "- This is a part of [LangChain Open Tutorial](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial)\n", + "\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/99-TEMPLATE/00-BASE-TEMPLATE-EXAMPLE.ipynb) [![Open in GitHub](https://img.shields.io/badge/Open%20in%20GitHub-181717?style=flat-square&logo=github&logoColor=white)](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/99-TEMPLATE/00-BASE-TEMPLATE-EXAMPLE.ipynb)\n", + "\n", + "## Overview\n", + "Neo4j is a Graph database backed by vector store and can be deployed locally or on cloud.\n", + "\n", + "In this tutorial we utilize its ability to store vectors only, and deal with its real ability, Graph database, later.\n", + "\n", + "To encode data into vector, we use ```OpenAIEmbedding```, but you can use any embedding you want.\n", + "\n", + "Furthermore, you need to note that you should read about ```Cypher```, declarative query language for Neo4j, to fully utilize Neo4j.\n", + "\n", + "We use some Cypher queries but will not go deeply. 
You can visit Cypher official document web site in References.\n", + "\n", + "For more information, visit [Neo4j](https://neo4j.com/).\n", + "\n", + "### Table of Contents\n", + "\n", + "- [Overview](#overview)\n", + "- [Environment Setup](#environment-setup)\n", + "- [Setup Neo4j](#setup-neo4j)\n", + "\t- [Getting started with Aura](#getting-started-with-aura)\n", + "\t- [Getting started with Docker](#getting-started-with-docker)\n", + "- [Credentials](#credentials)\n", + "- [Initialization](#initialization)\n", + "\t- [List Indexes](#list-indexs)\n", + "\t- [Create Index](#create-index)\n", + "\t- [Delete Index](#delete-index)\n", + "\t- [Select Embedding model](#select-embeddings-model)\n", + "\t- [Data Preprocessing](#data-preprocessing)\n", + "- [Manage vector store](#manage-vector-store)\n", + "\t- [Add items to vector store](#add-items-to-vector-store)\n", + "\t- [Delete items from vector store](#delete-items-from-vector-store)\n", + "\t- [Scroll items from vector store](#scroll-items-from-vector-store)\n", + "\t- [(Advanced)Scroll items with query](#advanced-scroll-items-with-query)\n", + "- [Similarity search](#similarity-search)\n", + "\n", + "### References\n", + "\n", + "- [Cypher](https://neo4j.com/docs/cypher-manual/current/introduction/)\n", + "- [Neo4j Docker Installation](https://hub.docker.com/_/neo4j)\n", + "- [Neo4j Official Installation guide](https://neo4j.com/docs/operations-manual/current/installation/)\n", + "- [Neo4j Python SDK document](https://neo4j.com/docs/api/python-driver/current/index.html)\n", + "- [Neo4j document](https://neo4j.com/docs/)\n", + "- [Langchain Neo4j document](https://python.langchain.com/docs/integrations/vectorstores/neo4jvector/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wEk1SoFEfwjo" + }, + "source": [ + "## Environment Setup\n", + "\n", + "Set up the environment. 
You may refer to [Environment Setup](https://wikidocs.net/257836) for more details.\n", + "\n", + "**[Note]**\n", + "- ```langchain-opentutorial``` is a package that provides a set of easy-to-use environment setup, useful functions and utilities for tutorials. \n", + "- You can checkout the [```langchain-opentutorial```](https://github.com/LangChain-OpenTutorial/langchain-opentutorial-pypi) for more details.\n", + "- We built ```Neo4jDB``` class from Python SDK of ```Neo4j```. Langchain also supports neo4j vector store class but it lacks some methods like delete. Look neo4j_interface.py in utils" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "n9NVKk-Zf9Nq" + }, + "outputs": [], + "source": [ + "%%capture --no-stderr\n", + "%pip install langchain-opentutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "# Pip install necessary package\n", + "%pip install -qU neo4j" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "IMx2hZNXf9QL" + }, + "outputs": [], + "source": [ + "# Install required packages\n", + "from langchain_opentutorial import package\n", + "\n", + "package.install(\n", + " [\n", + " \"langsmith\",\n", + " \"langchain\",\n", + " \"langchain_core\",\n", + " \"langchain_community\",\n", + " \"langchain_openai\",\n", + " \"neo4j\",\n", + " \"nltk\",\n", + " ],\n", + " verbose=False,\n", + " upgrade=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "N8C6pLTZf9Sb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Environment variables have been set successfully.\n" + ] + } + ], + "source": [ + "# Set environment variables\n", + "from langchain_opentutorial import set_env\n", + "\n", + 
"set_env(\n", + " {\n", + " \"OPENAI_API_KEY\": \"Your OpenAI API Key\",\n", + " \"LANGCHAIN_API_KEY\": \"Your LangChain API Key\",\n", + " \"LANGCHAIN_TRACING_V2\": \"true\",\n", + " \"LANGCHAIN_ENDPOINT\": \"https://api.smith.langchain.com\",\n", + " \"LANGCHAIN_PROJECT\": \"Neo4j\",\n", + " \"NEO4J_URI\": \"Your Neo4j Aura URI\",\n", + " \"NEO4J_USERNAME\": \"Your Neo4j Aura username\",\n", + " \"NEO4J_PASSWORD\": \"Your Neo4j Aura password\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can alternatively set API keys such as ```OPENAI_API_KEY``` in a ```.env``` file and load them.\n", + "\n", + "[Note] This is not necessary if you've already set the required API keys in previous steps." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load API keys from .env file\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv(override=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup Neo4j\n", + "We have two options to start with. 
Cloud or local deployment.\n", + "\n", + "In this tutorial, we will use the Cloud service called ```Aura```, provided by ```Neo4j```.\n", + "\n", + "But we will also describe how to deploy ```Neo4j``` with docker.\n", + "\n", + "### Getting started with Aura\n", + "You can create a new **Neo4j Aura** account at the official [Neo4j](https://neo4j.com/) website.\n", + "\n", + "Visit the website and click **Get Started Free** at the top right.\n", + "\n", + "Once you have signed in, you will see a **Create instance** button, and after clicking it you will see your username and password.\n", + "\n", + "To get your API key, click **Download and continue** to download a txt file which contains the API key needed to connect to your **Neo4j Aura** instance.\n", + "\n", + "### Getting started with Docker\n", + "We now describe how to run ```Neo4j``` using docker.\n", + "\n", + "To run a Neo4j container, we use the following command.\n", + "```\n", + "docker run \\\n", + " -itd \\\n", + " --publish=7474:7474 --publish=7687:7687 \\\n", + " --volume=$HOME/neo4j/data:/data \\\n", + " --env=NEO4J_AUTH=none \\\n", + " --name neo4j \\\n", + " neo4j\n", + "```\n", + "\n", + "You can visit the **Neo4j Docker installation** reference for more detailed information.\n", + "\n", + "**[NOTE]**\n", + "* ```Neo4j``` also supports native deployment on macOS, Windows and Linux. Visit the **Neo4j Official Installation guide** reference for more detail.\n", + "* ```Neo4j``` community edition only supports one database." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "USvgdgjznDsd" + }, + "source": [ + "## Credentials\n", + "Now, if you have successfully created your own account for Aura, you will get your ```NEO4J_URI```, ```NEO4J_USERNAME``` and ```NEO4J_PASSWORD```.\n", + "\n", + "Add them to the environment variables above or to your ```.env``` file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "QzFkuokSnL1e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected to Neo4j database\n", + "Connection info\n", + "URI=neo4j+s://3ed1167e.databases.neo4j.io\n", + "username=neo4j\n", + "Neo4j version is above 5.23\n" + ] + } + ], + "source": [ + "import os\n", + "import time\n", + "from utils.neo4j_interface import Neo4jDB\n", + "\n", + "# set uri, username, password\n", + "uri = os.getenv(\"NEO4J_URI\")\n", + "username = os.getenv(\"NEO4J_USERNAME\")\n", + "password = os.getenv(\"NEO4J_PASSWORD\")\n", + "\n", + "client = Neo4jDB(uri=uri, username=username, password=password)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have established a connection to the Aura ```Neo4j``` database, we can check the connection info using the ```get_api_key``` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get connection info\n", + "client.get_api_key()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MWKVm4IkgHPn" + }, + "source": [ + "## Initialization\n", + "If you are successfully connected to **Neo4j Aura**, there are some basic indexes already created.\n", + "\n", + "But, in this tutorial we will create a new index and add items (nodes) to it.\n", + "\n", + "To do this, we first look at how to manage indexes.\n", + "\n", + "To manage indexes, we will see how to:\n", + "* List indexes\n", + "* Create new index\n", + "* Delete index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mdvN2pRtzpHB" + }, + "source": [ + "### List Indexs\n", + "Before creating a new index, let's check the indexes already in the ```Neo4j``` database." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "8AgTNAl5zo3E" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['index_343aff4e', 
'index_f7700477']\n" + ] + } + ], + "source": [ + "# get name list of indexes\n", + "names = client.list_indexes()\n", + "\n", + "print(names)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R94h5sY7oLyh" + }, + "source": [ + "### Create Index\n", + "\n", + "Now we will create a new index.\n", + "\n", + "This can be done by calling the `create_index` method, which will return an object connected to the newly created index.\n", + "\n", + "If an index with the same name already exists, the method will print out a notification.\n", + "\n", + "When we create a new index, we must provide an embedding object or the dimension of the vectors, and the ```metric``` to use for similarity search.\n", + "\n", + "In this tutorial we will pass `OpenAIEmbeddings` when we create a new index.\n", + "\n", + "\n", + "**[ NOTE ]**\n", + "- If you pass the dimension of the vectors instead of an embedding object, it must match the dimension of the embedded vectors produced by your choice of embedding model.\n", + "- An embedding object must have ```embed_query``` and ```embed_documents``` methods.\n", + "- ```metric``` is used to set the distance method for similarity search. ```Neo4j``` supports **cosine** and **euclidean** ." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "tR3bb-F5hCf9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created index information\n", + "Index name: tutorial_index\n", + "Node label: tutorial_node\n", + "Similarity metric: COSINE\n", + "Embedding dimension: 1536\n", + "Embedding node property: embedding\n", + "Text node property: text\n", + "\n", + "Index creation was successful\n", + "['index_343aff4e', 'index_f7700477', 'tutorial_index']\n" + ] + } + ], + "source": [ + "# Initialize OpenAIEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "# set index_name and node_label\n", + "index_name = \"tutorial_index\"\n", + "node_label = \"tutorial_node\"\n", + "\n", + "# create a new index\n", + "index = client.create_index(\n", + " embedding=embeddings, index_name=index_name, node_label=node_label\n", + ")\n", + "\n", + "if isinstance(index, Neo4jDB):\n", + " print(\"Index creation was successful\")\n", + "\n", + "# check name list of indexes\n", + "names = client.list_indexes()\n", + "print(names)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AKYPHPk40c4X" + }, + "source": [ + "### Delete Index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ua5yewan0TVy" + }, + "source": [ + "We can delete specific index by calling `delete_index` method.\n", + "\n", + "Delete ```tutorial_index``` we created above and then create it again to use later." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index deleted succesfully \n", + "['index_343aff4e', 'index_f7700477']\n", + "Created index information\n", + "Index name: tutorial_index\n", + "Node label: tutorial_node\n", + "Similarity metric: COSINE\n", + "Embedding dimension: 1536\n", + "Embedding node property: embedding\n", + "Text node property: text\n", + "\n" + ] + } + ], + "source": [ + "# delete index\n", + "client.delete_index(\"tutorial_index\")\n", + "\n", + "# print name list of indexes\n", + "names = client.list_indexes()\n", + "if \"tutorial_index\" not in names:\n", + " print(f\"Index deleted succesfully \")\n", + " print(names)\n", + "\n", + "# recreate the tutorial_index\n", + "index = client.create_index(\n", + " embedding=embeddings, index_name=\"tutorial_index\", node_label=\"tutorial_node\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lwb_OMHunjwh" + }, + "source": [ + "### Select Embeddings model\n", + "\n", + "We also can change embedding model.\n", + "\n", + "In this subsection we use ```text-embedding-3-large``` model to create a new index with it" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "tRjB4EvXnoZM" + }, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "if not os.environ.get(\"OPENAI_API_KEY\"):\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter API key for OpenAI: \")\n", + "\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embeddings_large = OpenAIEmbeddings(model=\"text-embedding-3-large\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created index information\n", + "Index name: tutorial_index_2\n", + "Node label: tutorial_node_2\n", + "Similarity metric: COSINE\n", + 
"Embedding dimension: 3072\n", + "Embedding node property: embedding\n", + "Text node property: text\n", + "\n" + ] + } + ], + "source": [ + "# create new index\n", + "index2 = client.create_index(\n", + " embedding=embeddings_large,\n", + " index_name=\"tutorial_index_2\",\n", + " node_label=\"tutorial_node_2\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FtUJ2xSPrq3P" + }, + "source": [ + "### Data Preprocessing\n", + "\n", + "Below is the preprocessing process for general documents.\n", + "\n", + "- Need to extract **metadata** from documents\n", + "- Filter documents by minimum length.\n", + " \n", + "- Determine whether to use ```basename``` or not. Default is ```False```.\n", + " - ```basename``` denotes the last value of the filepath.\n", + " - For example, **document.pdf** will be the ```basename``` for the filepath **./data/document.pdf** ." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "bmQI6bOsvJbu" + }, + "outputs": [], + "source": [ + "# This is a long document we can split up.\n", + "data_path = \"./data/the_little_prince.txt\"\n", + "with open(data_path, encoding=\"utf8\") as f:\n", + " raw_text = f.read()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VYdoMc4zvPe2", + "outputId": "83cff661-a0bd-4ac3-d5dd-52c61ebd1627" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='The Little Prince\n", + "Written By Antoine de Saiot-Exupery (1900〜1944)'\n" + ] + } + ], + "source": [ + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "# define text splitter\n", + "text_splitter = RecursiveCharacterTextSplitter(\n", + " # Set a really small chunk size, just to show.\n", + " chunk_size=100,\n", + " chunk_overlap=20,\n", + " length_function=len,\n", + " is_separator_regex=False,\n", + ")\n", + "\n", + "# split raw text 
by splitter.\n", + "split_docs = text_splitter.create_documents([raw_text])\n", + "\n", + "# print one of documents to check its structure\n", + "print(split_docs[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we preprocess the split documents to extract the author, page and source metadata, and to fit the data into a form that can be stored in `Neo4j`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "gGJ_bBJcw5af" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='The Little Prince\n", + "Written By Antoine de Saiot-Exupery (1900〜1944)' metadata={'source': 'the_little_prince.txt', 'page': 1, 'author': 'Saiot-Exupery'}\n" + ] + } + ], + "source": [ + "# preprocess raw documents\n", + "processed_docs = client.preprocess_documents(\n", + " split_docs=split_docs,\n", + " metadata_keys=[\"source\", \"page\", \"author\"],\n", + " min_length=5,\n", + " use_basename=True,\n", + " source=data_path,\n", + " author=\"Saiot-Exupery\",\n", + ")\n", + "\n", + "# print one of preprocessed document to check its structure\n", + "print(processed_docs[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1B7xc2p3lXPV" + }, + "source": [ + "## Manage vector store\n", + "Once you have created your vector store, you can interact with it by adding and deleting different items.\n", + "\n", + "Also, you can scroll data from the store with a filter or with a ```Cypher``` query." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kxxP2ecAohWb" + }, + "source": [ + "\n", + "### Add items to vector store\n", + "\n", + "We can add items to our vector store by using the ```upsert_documents``` or ```upsert_documents_parallel``` method.\n", + "\n", + "If you pass ids along with documents, then ids will be used, but if you do not pass ids, it will be created based `page_content` using md5 hash function.\n", + "\n", + "Basically, ```upsert_document``` and ```upsert_document_parallel``` methods do upsert not insert, based on **id** of the item.\n", + "\n", + "So if you provided id and want to update data, you must provide the same id that you provided at first upsertion.\n", + "\n", + "We will upsert data to index, tutorial_index, with ```upsert_documents``` method for the first half, and with ```upsert_documents_parallel``` for the second half." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "7cS0FHgalwPm" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "826cd68e8ba547ef8726bd7b7dc33670", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upserting documents...: 0%| | 0/22 [00:00= (5, 23, 0): + return True + else: + return False + + def connect(self) -> None: + """Connect to neo4j graph database. 
def create_index(
    self,
    embedding,
    embedding_node_property: str = "embedding",
    text_node_property: str = "text",
    index_name: str = "vector",
    metric: str = "cosine",
    node_label: str = "Chunk",
    _database: str = "neo4j",
    **kwargs,
):
    """Create a vector index, or reconnect to it when the name is already taken.

    Args:
        embedding: Embedding object handed to the index helper. May be
            ``None`` when ``dimension`` is supplied through ``kwargs``.
        embedding_node_property: Node property that stores the vector.
        text_node_property: Node property that stores the raw text.
        index_name: Name of the index to create or reuse.
        metric: Similarity metric name passed on to the helper.
        node_label: Label of the nodes covered by the index.
        _database: Database name passed on to the helper.
        **kwargs: Extra options forwarded to the helper. ``dimension`` is the
            one ``_create_new_index`` reads; it was silently dropped before,
            so an index could never be created from a dimension alone.

    Returns:
        Whatever ``_return_exist_index`` (name already listed) or
        ``_create_new_index`` (otherwise) returns.
    """
    # Caller-supplied extras go in first so that the connection settings and
    # the named arguments below always win over a colliding key.
    options = {
        **kwargs,
        "uri": self.uri,
        "username": self.username,
        "password": self.password,
        "embedding": embedding,
        "embedding_node_property": embedding_node_property,
        "text_node_property": text_node_property,
        "index_name": index_name,
        "metric": metric,
        "node_label": node_label,
        "_database": _database,
    }

    if index_name in self.list_indexes():
        print(f"index {index_name} exists")
        return self._return_exist_index(self.client, **options)

    return self._create_new_index(self.client, **options)
@classmethod
def _create_new_index(
    cls,
    client,
    uri,
    username,
    password,
    embedding,
    embedding_node_property: str = "embedding",
    text_node_property: str = "text",
    index_name: str = "vector",
    metric: str = "cosine",
    node_label: str = "Chunk",
    _database: str = "neo4j",
    **kwargs,
):
    """Create a new vector index in Neo4j and return an instance bound to it.

    Args:
        client: Object whose ``execute_query`` runs the Cypher statement.
        uri: Connection URI passed on to the returned instance.
        username: Username passed on to the returned instance.
        password: Password passed on to the returned instance.
        embedding: Embedding object. Only used here to probe the vector size
            (via ``embed_query``) when no ``dimension`` is given.
        embedding_node_property: Node property that stores the vector.
        text_node_property: Node property that stores the raw text.
        index_name: Name of the index to create.
        metric: Key of ``METRIC`` selecting the similarity function.
        node_label: Label of the nodes covered by the index.
        _database: Name of the database the statement is executed against.
        **kwargs: ``dimension`` may be given instead of ``embedding``.

    Returns:
        A new ``cls`` instance on success; ``None`` when the statement failed
        (the error is printed, not raised).

    Raises:
        AssertionError: If ``metric`` is not a key of ``METRIC``.
        ValueError: If neither ``embedding`` nor ``dimension`` is available.
    """
    # Raised explicitly (instead of ``assert``) so the check survives ``-O``
    # while callers still see the same exception type and message.
    if metric not in METRIC:
        raise AssertionError(
            f"Choose metric among {list(METRIC.keys())}. Your metric is {metric}"
        )

    dimension = kwargs.get("dimension")
    if embedding is None and dimension is None:
        raise ValueError(
            "You must provide either embedding function or dimension of resulting vector when you encode a document with your choice of embedding function."
        )

    # An explicit ``dimension=None`` now falls back to probing the embedding
    # instead of sending a null dimension to the server.
    if dimension is None:
        dimension = len(embedding.embed_query("foo"))
    metric = METRIC[metric]

    index_query = (
        f"CREATE VECTOR INDEX {index_name} IF NOT EXISTS "
        f"FOR (m:`{node_label}`) ON m.`{embedding_node_property}` "
        "OPTIONS { indexConfig: { "
        "`vector.dimensions`: toInteger($embedding_dimension), "
        "`vector.similarity_function`: $similarity_metric }}"
    )

    parameters = {
        "embedding_dimension": dimension,
        "similarity_metric": metric,
    }

    try:
        # ``database_`` (trailing underscore) is the driver's routing argument;
        # a plain ``database=`` keyword is treated as one more Cypher parameter
        # and the statement would run against the default database.
        client.execute_query(
            index_query, parameters_=parameters, database_=_database
        )
    except Exception as e:
        # Best effort by design: report and signal the failure with ``None``.
        print("Failed to create index")
        print(e)
        return None

    info_str = (
        f"Index name: {index_name}\n"
        f"Node label: {node_label}\n"
        f"Similarity metric: {metric}\n"
        f"Embedding dimension: {dimension}\n"
        f"Embedding node property: {embedding_node_property}\n"
        f"Text node property: {text_node_property}\n"
    )
    print("Created index information")
    print(info_str)
    return cls(
        uri=uri,
        username=username,
        password=password,
        embedding=embedding,
        index_name=index_name,
        node_label=node_label,
        _database=_database,
        metric=metric,
        embedding_node_property=embedding_node_property,
        text_node_property=text_node_property,
        dimension=dimension,
    )
def delete_index(self, index_name: str) -> Union[bool, None]:
    """Drop the index called ``index_name``.

    Args:
        index_name: Name of the index to delete.

    Returns:
        ``True`` when the index was dropped. ``None`` when no such index
        exists; a notice is printed instead of returning a (truthy) message
        string, so the result matches the annotation and cannot be mistaken
        for success.

    Raises:
        Exception: Whatever ``execute_query`` raised while dropping the index.
    """
    if self.get_index(index_name) is None:
        print(f"{index_name} does not exists")
        return None

    try:
        self.client.execute_query(f"DROP INDEX {index_name}")
    except Exception:
        print(f"Drop index {index_name} failed")
        raise
    return True
def delete_by_id(self, index_name: str = None, ids: List = None, **kwargs) -> bool:
    """Delete the nodes whose ``id`` property is one of ``ids``.

    Args:
        index_name: When given, only nodes carrying the label of this index
            are considered; otherwise every node is a candidate.
        ids: A list of ids, or a single id (wrapped into a list).

    Returns:
        ``True`` if the delete statement ran without error.

    Raises:
        ValueError: If ``ids`` is ``None`` (previously an
            ``UnboundLocalError`` from deep inside the method).
        Exception: Whatever ``execute_query`` raised.
    """
    if ids is None:
        raise ValueError("You must provide ids")
    if not isinstance(ids, list):
        ids = [ids]

    if index_name is not None:
        label = self.client.execute_query(
            f"SHOW INDEX YIELD * WHERE name='{index_name}' RETURN labelsOrTypes"
        ).records[0]["labelsOrTypes"][0]
        # Back-tick quoting mirrors how the upsert path writes the label.
        prefix_query = f"MATCH (n:`{label}`)\n"
    else:
        prefix_query = "MATCH (n)\n"

    # Ids travel as a query parameter, not as the Python list repr, so quotes
    # or other special characters inside an id cannot alter the statement.
    query = prefix_query + " WHERE n.id IN $ids\nDETACH DELETE n"

    try:
        self.client.execute_query(query, parameters_={"ids": ids})
    except Exception:
        print("Delete by id failed")
        raise
    return True
def add_embedding(
    self, documents: "list[Document]", ids: "list[str] | None" = None
) -> tuple:
    """Embed ``documents`` and pair them with ids and metadata.

    Args:
        documents: Objects exposing ``page_content`` and ``metadata``.
        ids: One id per document. When missing, empty, or containing any
            falsy entry, every id is replaced by the md5 hex digest of the
            document text.

    Returns:
        A ``(texts, encoded_vectors, ids, metadatas)`` tuple of four
        parallel lists.

    Raises:
        ValueError: If usable ids are given but their count differs from the
            number of documents.
    """
    texts = [doc.page_content for doc in documents]

    # ``all([])`` is True, so an absent/empty id list has to be tested on its
    # own; otherwise no ids are produced and nothing gets upserted downstream.
    if not ids or not all(ids):
        ids = [md5(text.encode("utf-8")).hexdigest() for text in texts]
    elif len(ids) != len(texts):
        raise ValueError("Size of documents and ids must be the same")

    metadatas = [doc.metadata for doc in documents]
    encoded = self.embedding.embed_documents(texts)

    return (texts, encoded, ids, metadatas)
+ """ + + texts, encodes, ids, metadatas = self.add_embedding(documents, ids) + + parameters = { + "data": [ + {"text": text, "metadata": metadata, "embedding": encode, "id": id} + for text, metadata, encode, id in zip(texts, metadatas, encodes, ids) + ] + } + + if self.is_neo4j_above_523: + call_prefix = "CALL (row) { " + else: + call_prefix = "CALL { WITH row " + + import_query = ( + "UNWIND $data AS row " + f"{call_prefix}" + f"MERGE (c:`{self.node_label}` {{id: row.id}}) " + "WITH c, row " + f"CALL db.create.setNodeVectorProperty(c, " + f"'{self.embedding_node_property}', row.embedding) " + f"SET c.`{self.text_node_property}` = row.text " + "SET c += row.metadata " + "} IN TRANSACTIONS OF 1000 ROWS " + ) + try: + self.client.execute_query(import_query, parameters_=parameters) + except Exception as e: + if "can only be executed in an implicit transaction" in str(e): + self.client.session().run(neo4j.Query(text=import_query), parameters) + elif "failed to obtain a connection from the pool" in str(e): + time.sleep(10) + self.client.session().run(neo4j.Query(text=import_query), parameters) + + return ids + + def upsert_documents(self, documents, batch_size=32, ids=None, **kwargs): + """Upsert documents into the vectorstore + + Args: + - documents: List of documents to upsert into the vectorstore + - batch_size: Batch size of documents to add or update. Default is 32. + - kwargs: Additional keyword arguments. + if kwargs contains ids and documents contain ids, + the ids in the kwargs will receive precedence. + + Returns: + Returns list of ids of the documents upserted. + """ + assert self.index_name is not None, "You MUST connect to index first." 
+ + if self.node_label is None and self.index_name is not None: + self.node_label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + + if ids is not None: + assert len(ids) == len( + documents + ), "Size of documents and ids must be the same" + + else: + ids = [False] * len(documents) + + id_batches = [ids[i : i + batch_size] for i in range(0, len(ids), batch_size)] + + if batch_size > len(documents): + batch_size = len(documents) + + doc_batches = [ + documents[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + + result_ids = [] + + for doc_batch, id_batch in zip(doc_batches, id_batches): + result_ids.extend(self._insert_documents(doc_batch, id_batch)) + + return result_ids + + def upsert_documents_parallel( + self, documents, batch_size=32, max_workers=10, ids=None, **kwargs + ): + """Add or update documents in the vectorstore parallel. + + Args: + documents: Documents to add to the vectorstore. + batch_size: Batch size of documents to add or update. + Default is 32. + max_workers: Number of threads to use. + Default is 10. + kwargs: Additional keyword arguments. + if kwargs contains ids and documents contain ids, + the ids in the kwargs will receive precedence. + + Returns: + List of IDs of the added texts. + + Raises: + ValueError: If the number of ids does not match the number of documents. + """ + assert self.index_name is not None, "You MUST connect to index first." 
+ + if ids is not None: + assert len(ids) == len( + documents + ), "Size of documents and ids must be the same" + + if batch_size > len(documents): + batch_size = len(documents) + + doc_bathces = [ + documents[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + id_batches = [ + ids[i : i + batch_size] for i in range(0, len(documents), batch_size) + ] + + if self.node_label is None and self.index_name is not None: + self.node_label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit(self.upsert_documents, batch, ids=ids) + for batch, ids in zip(doc_bathces, id_batches) + ] + results = [] + for future in tqdm( + as_completed(futures), total=len(futures), desc="Upserting documents..." + ): + result = future.result() + if result: + results.extend(result) + + return results + + def delete_by_query( + self, query: str, index_name: Optional[str | None] = None, **kwrags + ) -> bool: + """Delete nodes by query + Args: + - query: Cypher query + - index_name: Optional. Default is None. If specified, will delete node only in the given index + + Returns: + - True if deletion is successful else raise error + """ + try: + self.client.execute_query(query) + except Exception as e: + print(f"Error {e} occured during deletion") + raise e + else: + return True + + def scroll_by_query( + self, query: str, index_name: Optional[str | None] = None, **kwargs + ) -> List: + """Scroll nodes by query + Args: + - query: Cypher query + - index_name: Optional. Default is None. 
If specified, will scroll node only in the given index + + Returns: + - List of nodes if successful, else raise error + """ + try: + _result = self.client.execute_query(query) + except Exception as e: + print(f"Error {e} occured during scroll") + raise e + else: + result = [] + for record in _result.records: + result.append({k: record[k] for k in record.keys()}) + return result + + def scroll_by_filter( + self, + filters=None, + ids=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ) -> List[Dict]: + """Query nodes by filter or id + If none of filter or id provided, will return all nodes. + If this method is called directly from client without index_name set, all nodes will be returned. + + Args: + - filters: filter for query data + - ids: id for query data + - limit: number of nodes to return + - include_embedding: Set True to include embedded vector to result. Default is False + - include_meta: list of metadata keys to include. If set to None, all metadatas will be included. Default is None. 
+ + Returns: + - list of nodes + """ + + if self.index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if filters is None and ids is None: + filter_query = "" + + elif filters is not None: + filter_queries = [] + for k, v in filters.items(): + if not isinstance(v, list): + v = [v] + filter_queries.append(f"n.{k} IN {v}") + filter_query = " AND ".join(filter_queries) + + elif ids is not None: + filter_queries = [] + filter_query = f"n.id IN {ids}" + + limit_query = "\nRETURN n" if limit is None else f"\nRETURN n LIMIT {limit}" + + if filter_query != "": + query = prefix_query + " WHERE " + filter_query + limit_query + else: + query = prefix_query + limit_query + + _results = self.client.execute_query(query) + + results = list() + + for _result in _results.records: + node = _result["n"] + + result = {"id": node["id"]} + if include_embedding: + result.update({"embedding": node["embedding"]}) + if include_meta is None: + include_meta = [k for k in node.keys() if k not in ["id", "embedding"]] + result.update( + {"metadata": {k: node[k] for k in node.keys() if k in include_meta}} + ) + results.append(result) + + return results + + def scroll_by_ids( + self, + ids=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ) -> List[Dict]: + """Query nodes by filter or id + If none of filter or id provided, will return all nodes. + If this method is called directly from client without index_name set, all nodes will be returned. + + Args: + - ids: id for query data + - limit: number of nodes to return + - include_embedding: Set True to include embedded vector to result. Default is False + - include_meta: list of metadata keys to include. If set to None, all metadatas will be included. Default is None. 
+ + Returns: + - list of nodes + """ + + if self.index_name is not None: + label = self.client.execute_query( + f"SHOW INDEX YIELD * WHERE name='{self.index_name}' RETURN labelsOrTypes" + ).records[0]["labelsOrTypes"][0] + prefix_query = f"MATCH (n:{label})\n" + else: + prefix_query = "MATCH (n)\n" + + if ids is not None: + if not isinstance(ids, list): + ids = [ids] + filter_query = f"n.id IN {ids}" + + limit_query = "\nRETURN n" if limit is None else f"\nRETURN n LIMIT {limit}" + + query = prefix_query + " WHERE " + filter_query + limit_query + + _results = self.client.execute_query(query) + + results = list() + + for _result in _results.records: + node = _result["n"] + + result = {"id": node["id"]} + if include_embedding: + result.update({"embedding": node["embedding"]}) + if include_meta is None: + include_meta = [k for k in node.keys() if k not in ["id", "embedding"]] + result.update( + {"metadata": {k: node[k] for k in node.keys() if k in include_meta}} + ) + results.append(result) + + return results + + def scroll_nodes( + self, + filters=None, + ids=None, + query=None, + limit=10, + include_embedding=False, + include_meta=None, + **kwargs, + ): + if filters is not None: + print("Scroll nodes by filter") + return self.scroll_by_filter( + filters=filters, + include_embedding=include_embedding, + include_meta=include_meta, + limit=limit, + ) + elif ids is not None: + print("Scroll nodes by ids") + return self.scroll_by_ids( + ids=ids, + include_embedding=include_embedding, + include_meta=include_meta, + limit=limit, + ) + elif query is not None: + print("Scroll nodes by query") + return self.scroll_by_query(query=query) + else: + return self.scroll_by_filter( + include_embedding=include_embedding, + include_meta=include_meta, + limit=limit, + ) + + @staticmethod + def preprocess_documents( + split_docs, metadata_keys, min_length, use_basename=False, **kwargs + ): + metadata = kwargs + + if use_basename: + assert metadata.get("source", None) is not None, 
"source must be provided" + metadata["source"] = metadata["source"].split("/")[-1] + + result_docs = [] + for idx, doc in enumerate(split_docs): + if len(doc.page_content) < min_length: + continue + for k in metadata_keys: + doc.metadata.update({k: metadata.get(k, "")}) + doc.metadata.update({"page": idx + 1}) + result_docs.append(doc) + + return result_docs + + def search( + self, + query=None, + embeded_query=None, + index_name=None, + filters=[], + with_score=False, + top_k=3, + **kwargs, + ): + assert self.index_name is not None, "You must provide index name" + + if query is None and embeded_query is None: + raise ValueError("You must provide either query or embeded values of query") + + if query is not None and embeded_query is not None: + print( + "Both query and embeded value of query passed. Using embded value of query" + ) + + if embeded_query is None: + embeded_query = self.embedding.embed_query(query) + + if kwargs.get("include_vector"): + result_query = ( + f"MATCH (n:`{self.node_label}`) " + f"WITH n, vector.similarity.cosine($embeded, n.embedding) AS score " + f"ORDER BY score DESC " + f"RETURN r, score LIMIT $k " + f"n {{.*, `{self.text_node_property}`: Null, `{self.embedding_node_property}`: Null}} AS metadata LIMIT $k " + ) + else: + result_query = ( + f"MATCH (n:`{self.node_label}`) " + f"WITH n, vector.similarity.cosine($embeded, n.embedding) AS score " + f"ORDER BY score DESC " + f"RETURN score, " + f"n {{.*, `{self.embedding_node_property}`: Null}} AS metadata LIMIT $k " + ) + + parameters = { + "k": top_k, + "embeded": embeded_query, + } + + try: + _result = self.client.execute_query(result_query, parameters_=parameters) + except: + _result = self.client.session(database=self._database).run( + neo4j.Query(text=result_query), parameters + ) + + result = [] + for _r in _result.records: + result.append( + { + "text": _r["metadata"].pop("text"), + "metadata": _r["metadata"], + "score": round(float(_r["score"]), 3), + } + ) + + return result + + 
@staticmethod + def remove_lucene_chars(text: str) -> str: + """Remove Lucene special characters""" + special_chars = [ + "+", + "-", + "&", + "|", + "!", + "(", + ")", + "{", + "}", + "[", + "]", + "^", + '"', + "~", + "*", + "?", + ":", + "\\", + ] + for char in special_chars: + if char in text: + text = text.replace(char, " ") + return text.strip()