73 changes: 73 additions & 0 deletions 17-LangGraph/02-Structures/rag/base.py
@@ -0,0 +1,73 @@
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

from abc import ABC, abstractmethod
from operator import itemgetter
from langchain import hub


class RetrievalChain(ABC):
    def __init__(self):
        self.source_uri = None
        self.k = 10

    @abstractmethod
    def load_documents(self, source_uris):
        """Load documents from the given source URIs using a document loader."""
        pass

    @abstractmethod
    def create_text_splitter(self):
        """Create a text splitter."""
        pass

    def split_documents(self, docs, text_splitter):
        """Split the loaded documents with the given text splitter."""
        return text_splitter.split_documents(docs)

    def create_embedding(self):
        return OpenAIEmbeddings(model="text-embedding-3-small")

    def create_vectorstore(self, split_docs):
        return FAISS.from_documents(
            documents=split_docs, embedding=self.create_embedding()
        )

    def create_retriever(self, vectorstore):
        # Create a dense retriever that performs plain similarity search
        # over the vector store, returning the top-k chunks.
        dense_retriever = vectorstore.as_retriever(
            search_type="similarity", search_kwargs={"k": self.k}
        )
        return dense_retriever

    def create_model(self):
        return ChatOpenAI(model="gpt-4o-mini", temperature=0)

    def create_prompt(self):
        return hub.pull("teddynote/rag-prompt-chat-history")

    @staticmethod
    def format_docs(docs):
        # Join pre-extracted page-content strings into a single context block.
        return "\n".join(docs)

    def create_chain(self):
        docs = self.load_documents(self.source_uri)
        text_splitter = self.create_text_splitter()
        split_docs = self.split_documents(docs, text_splitter)
        self.vectorstore = self.create_vectorstore(split_docs)
        self.retriever = self.create_retriever(self.vectorstore)
        model = self.create_model()
        prompt = self.create_prompt()
        self.chain = (
            {
                "question": itemgetter("question"),
                "context": itemgetter("context"),
                "chat_history": itemgetter("chat_history"),
            }
            | prompt
            | model
            | StrOutputParser()
        )
        return self
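
Note that `create_chain` wires the prompt inputs from a plain input dict, so retrieval is not part of the chain itself: the caller fetches documents through `self.retriever` and passes the stitched text in as `context`. A minimal usage sketch, assuming `rag` is an instance of a concrete subclass (such as the `PDFRetrievalChain` below) on which `create_chain()` has already been called; the question is hypothetical:

```python
# Sketch only: `rag` is a concrete RetrievalChain after create_chain().
docs = rag.retriever.invoke("What is LangGraph?")  # top-k similarity search
answer = rag.chain.invoke(
    {
        "question": "What is LangGraph?",
        # The prompt expects context as a string; join the retrieved chunks.
        "context": rag.format_docs([d.page_content for d in docs]),
        "chat_history": "",  # empty history on the first turn
    }
)
```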
21 changes: 21 additions & 0 deletions 17-LangGraph/02-Structures/rag/pdf.py
@@ -0,0 +1,21 @@
from rag.base import RetrievalChain
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List, Annotated


class PDFRetrievalChain(RetrievalChain):
    def __init__(self, source_uri: Annotated[List[str], "List of PDF source URIs"]):
        super().__init__()
        self.source_uri = source_uri
        self.k = 10

    def load_documents(self, source_uris: List[str]):
        docs = []
        for source_uri in source_uris:
            loader = PDFPlumberLoader(source_uri)
            docs.extend(loader.load())

        return docs

    def create_text_splitter(self):
        # Small chunks (~300 chars) with 50-char overlap keep retrieval granular.
        return RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
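
A minimal instantiation sketch (the PDF path is hypothetical); `source_uri` is a list because `load_documents` iterates over it:

```python
# Sketch: build a retrieval chain over one hypothetical PDF.
pdf = PDFRetrievalChain(["data/sample.pdf"]).create_chain()
chunks = pdf.retriever.invoke("What does the introduction cover?")
print(len(chunks))  # up to k=10 chunks of roughly 300 characters each
```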
@@ -0,0 +1,50 @@
_type: "prompt"
template: |
  You are an AI assistant specializing in Question-Answering (QA) tasks within a Retrieval-Augmented Generation (RAG) system.
  Your primary mission is to answer questions based on the provided context or chat history.
  Ensure your response is concise and directly addresses the question without any additional narration.

  ###

  You may consider the previous conversation history to answer the question.

  # Here is the previous conversation history:
  {chat_history}

  ###

  Your final answer should be written concisely (but include important numerical values, technical terms, jargon, and names), followed by the source of the information.

  # Steps

  1. Carefully read and understand the context provided.
  2. Identify the key information related to the question within the context.
  3. Formulate a concise answer based on the relevant information.
  4. Ensure your final answer directly addresses the question.
  5. List the source of the answer in bullet points; each source must be a file name (with a page number) or URL from the context. Omit the sources if the answer is based on the previous conversation or if no source can be found.

  # Output Format:
  [Your final answer here, with numerical values, technical terms, jargon, and names in their original language]

  **Source** (optional)
  - (Source of the answer; must be a file name (with a page number) or URL from the context. Omit if the answer is based on the previous conversation or no source can be found.)
  - (list more if there are multiple sources)
  - ...

  ###

  Remember:
  - It is crucial to base your answer solely on the **provided context** or **chat history**.
  - DO NOT use any external knowledge or information not present in the given materials.
  - If the user's question relies on the previous conversation but there is no such conversation, or it does not contain enough information, answer that you don't know.

  ###

  # Here is the user's question:
  {question}

  # Here is the context that you should use to answer the question:
  {context}

  # Your final answer to the user's question:
input_variables: ["question", "context", "chat_history"]
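
Because this is a plain prompt rather than a chat template, `{chat_history}` is interpolated as text. One hedged convention is to render prior turns as `Role: message` lines; the turns below are hypothetical:

```python
# Hypothetical prior turns rendered into the {chat_history} slot.
chat_history = "\n".join(
    f"{role}: {msg}"
    for role, msg in [
        ("Human", "What is RAG?"),
        ("AI", "Retrieval-Augmented Generation: answering from retrieved context."),
    ]
)
```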
43 changes: 43 additions & 0 deletions 17-LangGraph/02-Structures/rag/prompts/rag-prompt.yaml
@@ -0,0 +1,43 @@
_type: "prompt"
template: |
  You are an AI assistant specializing in Question-Answering (QA) tasks within a Retrieval-Augmented Generation (RAG) system.
  Your primary mission is to answer questions based on the provided context.
  Ensure your response is concise and directly addresses the question without any additional narration.

  ###

  Your final answer should be written concisely (but include important numerical values, technical terms, jargon, and names), followed by the source of the information.

  # Steps

  1. Carefully read and understand the context provided.
  2. Identify the key information related to the question within the context.
  3. Formulate a concise answer based on the relevant information.
  4. Ensure your final answer directly addresses the question.
  5. List the source of the answer in bullet points; each source must be a file name (with a page number) or URL from the context. Omit the sources if none can be found.

  # Output Format:
  [Your final answer here, with numerical values, technical terms, jargon, and names in their original language]

  **Source** (optional)
  - (Source of the answer; must be a file name (with a page number) or URL from the context. Omit if no source can be found.)
  - (list more if there are multiple sources)
  - ...

  ###

  Remember:
  - It is crucial to base your answer solely on the **PROVIDED CONTEXT**.
  - DO NOT use any external knowledge or information not present in the given materials.
  - If you cannot find the answer in the provided context, answer that you don't know.

  ###

  # Here is the user's QUESTION that you should answer:
  {question}

  # Here is the CONTEXT that you should use to answer the question:
  {context}

  # Your final ANSWER to the user's QUESTION:
input_variables: ["question", "context"]
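
The bundled YAML prompts can also be loaded from disk with `load_prompt` instead of pulling from the hub; a sketch, assuming the working directory is `17-LangGraph/02-Structures` and a recent `langchain_core` (for the `encoding` argument):

```python
from langchain_core.prompts import load_prompt

# Load the bundled prompt from disk instead of pulling it from the LangChain hub.
prompt = load_prompt("rag/prompts/rag-prompt.yaml", encoding="utf-8")
print(sorted(prompt.input_variables))  # ['context', 'question']
```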
33 changes: 33 additions & 0 deletions 17-LangGraph/02-Structures/rag/utils.py
@@ -0,0 +1,33 @@
def format_docs(docs):
    """Render retrieved Documents as XML-like blocks with their source file
    and 1-based page number (PDFPlumber page metadata is 0-indexed)."""
    return "\n".join(
        [
            f"<document><content>{doc.page_content}</content><source>{doc.metadata['source']}</source><page>{int(doc.metadata['page'])+1}</page></document>"
            for doc in docs
        ]
    )


def format_searched_docs(docs):
    """Render web-search results (dicts with 'content' and 'url' keys) as
    XML-like blocks."""
    return "\n".join(
        [
            f"<document><content>{doc['content']}</content><source>{doc['url']}</source></document>"
            for doc in docs
        ]
    )


def format_task(tasks):
    """Parse 'task:N시간' strings into (task, hours) tuples ('시간' is Korean for 'hours')."""
    task_time_pairs = []

    for item in tasks:
        # Split on the last colon so task names may themselves contain colons.
        task, time_str = item.rsplit(":", 1)
        # Strip the Korean suffix '시간' ("hours") and convert to an integer.
        time = int(time_str.replace("시간", "").strip())
        # Store the (task, hours) pair.
        task_time_pairs.append((task, time))

    return task_time_pairs
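
Hedged usage sketches for these helpers; the document metadata and task strings are hypothetical (note the 0-indexed page becoming `1` in the output):

```python
from langchain_core.documents import Document

doc = Document(
    page_content="LangGraph builds stateful agent workflows.",
    metadata={"source": "data/sample.pdf", "page": 0},
)
print(format_docs([doc]))
# <document><content>LangGraph builds stateful agent workflows.</content><source>data/sample.pdf</source><page>1</page></document>

print(format_task(["write report:2시간", "review PR:1시간"]))
# [('write report', 2), ('review PR', 1)]
```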
73 changes: 73 additions & 0 deletions 17-LangGraph/03-Use-Cases/rag/base.py
@@ -0,0 +1,73 @@
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

from abc import ABC, abstractmethod
from operator import itemgetter
from langchain import hub


class RetrievalChain(ABC):
    def __init__(self):
        self.source_uri = None
        self.k = 10

    @abstractmethod
    def load_documents(self, source_uris):
        """Load documents from the given source URIs using a document loader."""
        pass

    @abstractmethod
    def create_text_splitter(self):
        """Create a text splitter."""
        pass

    def split_documents(self, docs, text_splitter):
        """Split the loaded documents with the given text splitter."""
        return text_splitter.split_documents(docs)

    def create_embedding(self):
        return OpenAIEmbeddings(model="text-embedding-3-small")

    def create_vectorstore(self, split_docs):
        return FAISS.from_documents(
            documents=split_docs, embedding=self.create_embedding()
        )

    def create_retriever(self, vectorstore):
        # Create a dense retriever that performs plain similarity search
        # over the vector store, returning the top-k chunks.
        dense_retriever = vectorstore.as_retriever(
            search_type="similarity", search_kwargs={"k": self.k}
        )
        return dense_retriever

    def create_model(self):
        return ChatOpenAI(model="gpt-4o-mini", temperature=0)

    def create_prompt(self):
        return hub.pull("teddynote/rag-prompt-chat-history")

    @staticmethod
    def format_docs(docs):
        # Join pre-extracted page-content strings into a single context block.
        return "\n".join(docs)

    def create_chain(self):
        docs = self.load_documents(self.source_uri)
        text_splitter = self.create_text_splitter()
        split_docs = self.split_documents(docs, text_splitter)
        self.vectorstore = self.create_vectorstore(split_docs)
        self.retriever = self.create_retriever(self.vectorstore)
        model = self.create_model()
        prompt = self.create_prompt()
        self.chain = (
            {
                "question": itemgetter("question"),
                "context": itemgetter("context"),
                "chat_history": itemgetter("chat_history"),
            }
            | prompt
            | model
            | StrOutputParser()
        )
        return self
21 changes: 21 additions & 0 deletions 17-LangGraph/03-Use-Cases/rag/pdf.py
@@ -0,0 +1,21 @@
from rag.base import RetrievalChain
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List, Annotated


class PDFRetrievalChain(RetrievalChain):
    def __init__(self, source_uri: Annotated[List[str], "List of PDF source URIs"]):
        super().__init__()
        self.source_uri = source_uri
        self.k = 10

    def load_documents(self, source_uris: List[str]):
        docs = []
        for source_uri in source_uris:
            loader = PDFPlumberLoader(source_uri)
            docs.extend(loader.load())

        return docs

    def create_text_splitter(self):
        # Small chunks (~300 chars) with 50-char overlap keep retrieval granular.
        return RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
@@ -0,0 +1,50 @@
_type: "prompt"
template: |
  You are an AI assistant specializing in Question-Answering (QA) tasks within a Retrieval-Augmented Generation (RAG) system.
  Your primary mission is to answer questions based on the provided context or chat history.
  Ensure your response is concise and directly addresses the question without any additional narration.

  ###

  You may consider the previous conversation history to answer the question.

  # Here is the previous conversation history:
  {chat_history}

  ###

  Your final answer should be written concisely (but include important numerical values, technical terms, jargon, and names), followed by the source of the information.

  # Steps

  1. Carefully read and understand the context provided.
  2. Identify the key information related to the question within the context.
  3. Formulate a concise answer based on the relevant information.
  4. Ensure your final answer directly addresses the question.
  5. List the source of the answer in bullet points; each source must be a file name (with a page number) or URL from the context. Omit the sources if the answer is based on the previous conversation or if no source can be found.

  # Output Format:
  [Your final answer here, with numerical values, technical terms, jargon, and names in their original language]

  **Source** (optional)
  - (Source of the answer; must be a file name (with a page number) or URL from the context. Omit if the answer is based on the previous conversation or no source can be found.)
  - (list more if there are multiple sources)
  - ...

  ###

  Remember:
  - It is crucial to base your answer solely on the **provided context** or **chat history**.
  - DO NOT use any external knowledge or information not present in the given materials.
  - If the user's question relies on the previous conversation but there is no such conversation, or it does not contain enough information, answer that you don't know.

  ###

  # Here is the user's question:
  {question}

  # Here is the context that you should use to answer the question:
  {context}

  # Your final answer to the user's question:
input_variables: ["question", "context", "chat_history"]