x3
June 15, 2024
0.1 RAG Application using LangChain and Open Source Models:
[35]: !pip install -q langchain langchain_community pypdf sentence-transformers␣
↪faiss-gpu ctransformers
[36]: from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
[37]: DATA_DIR_PATH = "/content/" # directory that DirectoryLoader scans for *.pdf files
CHUNK_SIZE = 500 # chunk_size handed to RecursiveCharacterTextSplitter
CHUNK_OVERLAP = 200 # chunk_overlap handed to RecursiveCharacterTextSplitter
VECTOR_DB_PATH = "/content/" # folder the FAISS index is saved to and later loaded from
EMBEDDER = "thenlper/gte-large" # Huggingface Embedding model
def chunk_and_store():
    """Build a FAISS vector store from the PDFs found in ``DATA_DIR_PATH``.

    Loads every ``*.pdf`` matched in ``DATA_DIR_PATH``, splits the loaded
    documents into overlapping chunks (``CHUNK_SIZE`` / ``CHUNK_OVERLAP``),
    embeds the chunks with the ``EMBEDDER`` model and saves the resulting
    index to ``VECTOR_DB_PATH``. Progress is reported via ``print``.

    Raises:
        ValueError: If no text chunks could be produced, e.g. because the
            directory contains no PDFs.
    """
    dir_loader = DirectoryLoader(
        DATA_DIR_PATH,
        glob='*.pdf',
        loader_cls=PyPDFLoader,
    )
    docs = dir_loader.load()
    print("PDFs Loaded & Chunking starts...")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    inp_txt = text_splitter.split_documents(docs)
    if not inp_txt:
        # Fail early with a clear message instead of letting the FAISS build
        # fail on an empty embedding list after the model has been loaded.
        raise ValueError(
            f"No PDF content found in {DATA_DIR_PATH!r}; nothing to index."
        )
    print("Data Chunks Created & Vector storing starts...")

    # Prefer the GPU, but fall back to the CPU so the function also works on
    # runtimes without CUDA instead of failing while loading the model.
    import torch

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    hfembeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDER,
        model_kwargs={'device': device},
    )

    db = FAISS.from_documents(inp_txt, hfembeddings)
    db.save_local(VECTOR_DB_PATH)
    print("Vector Store Creation Completed")
[38]: from langchain import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import CTransformers
from langchain.chains import RetrievalQA
# Prompt for the QA chain: {context} and {question} are the two placeholders
# listed in INP_VARS and filled in when the chain runs.
PROMPT_TEMPLATE = '''
With the information provided try to answer the question.
You are an expert in the field. Use the following context to answer the question as accurately as possible.
If the context does not contain enough information to answer the question, please state that explicitly.
Context: {context}
Question: {question}
Answer:
'''
INP_VARS = ['context', 'question']  # placeholder names used in PROMPT_TEMPLATE
CHAIN_TYPE = "stuff"  # chain_type passed to RetrievalQA.from_chain_type
SEARCH_KWARGS = {'k': 1}  # search_kwargs for the retriever; k = chunks returned per query
MODEL_CKPT = "TheBloke/Llama-2-7B-Chat-GGML"  # model passed to CTransformers
MODEL_TYPE = "llama"  # model_type passed to CTransformers
MAX_NEW_TOKENS = 512  # generation setting for the LLM
TEMPERATURE = 0.9  # generation setting for the LLM
class EduBotCreator:
    """Assemble a RetrievalQA chain over the FAISS index saved on disk.

    The instance copies its settings from the module-level configuration
    constants; ``create_chatbot()`` is the entry point that builds the
    prompt, the vector store and the LLM and wires them into one chain.
    """

    def __init__(self):
        """Copy the module-level configuration constants onto the instance."""
        self.prompt_temp = PROMPT_TEMPLATE
        self.input_variables = INP_VARS
        self.chain_type = CHAIN_TYPE
        self.search_kwargs = SEARCH_KWARGS
        self.embedder = EMBEDDER
        self.vector_db_path = VECTOR_DB_PATH
        self.model_ckpt = MODEL_CKPT
        self.model_type = MODEL_TYPE
        self.max_new_tokens = MAX_NEW_TOKENS
        self.temperature = TEMPERATURE

    def create_prompt(self):
        """Return a ``PromptTemplate`` built from the configured template."""
        custom_prompt_temp = PromptTemplate(
            template=self.prompt_temp,
            input_variables=self.input_variables,
        )
        return custom_prompt_temp

    def load_llm(self):
        """Load the configured model through the ``CTransformers`` wrapper."""
        # Generation settings must go through ``config``: the LangChain
        # CTransformers wrapper forwards that dict to the underlying model,
        # whereas the same options given as top-level keyword arguments are
        # not applied.
        llm = CTransformers(
            model=self.model_ckpt,
            model_type=self.model_type,
            config={
                'max_new_tokens': self.max_new_tokens,
                'temperature': self.temperature,
            },
        )
        return llm

    def load_vectordb(self):
        """Load the FAISS index from ``vector_db_path`` with the configured embedder."""
        # Prefer the GPU, but fall back to the CPU so loading also works on
        # runtimes without CUDA.
        import torch

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        hfembeddings = HuggingFaceEmbeddings(
            model_name=self.embedder,
            model_kwargs={'device': device},
        )
        # SECURITY: this flag opts in to deserializing the stored index
        # metadata. Only load index folders this notebook created itself,
        # never files from an untrusted source.
        vector_db = FAISS.load_local(
            self.vector_db_path,
            hfembeddings,
            allow_dangerous_deserialization=True,
        )
        return vector_db

    def create_bot(self, custom_prompt, vectordb, llm):
        """Wire prompt, retriever and LLM into a ``RetrievalQA`` chain.

        Args:
            custom_prompt: Prompt passed to the chain as its ``prompt``.
            vectordb: Vector store whose ``as_retriever()`` supplies context.
            llm: Language model that generates the answer.

        Returns:
            The chain, configured to also return its source documents.
        """
        retrieval_qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type=self.chain_type,
            retriever=vectordb.as_retriever(search_kwargs=self.search_kwargs),
            return_source_documents=True,
            chain_type_kwargs={"prompt": custom_prompt},
        )
        return retrieval_qa_chain

    def create_chatbot(self):
        """Build every component, keep each on the instance, and return the chain."""
        self.custom_prompt = self.create_prompt()
        self.vector_db = self.load_vectordb()
        self.llm = self.load_llm()
        self.bot = self.create_bot(self.custom_prompt, self.vector_db, self.llm)
        return self.bot
[41]: chunk_and_store() # build and save the FAISS index that EduBotCreator.load_vectordb() reads back
PDFs Loaded & Chunking starts…
Data Chunks Created & Vector storing starts…
/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132:
FutureWarning: `resume_download` is deprecated and will be removed in version
1.0.0. Downloads always resume when possible. If you want to force a new
3
download, use `force_download=True`.
warnings.warn(
Vector Store Creation Completed
[42]: edubot_creator = EduBotCreator() # settings come from the module-level configuration constants
edubot = edubot_creator.create_chatbot() # builds prompt, vector store and LLM, returns the RetrievalQA chain
Fetching 1 files: 0%| | 0/1 [00:00<?, ?it/s]
Fetching 1 files: 0%| | 0/1 [00:00<?, ?it/s]
[43]: # Function to ask question from the bot
def ask_question(bot, question):
    """Send one question to the bot and return only the answer text.

    Args:
        bot: The chain returned by ``EduBotCreator.create_chatbot()``, or
            any object offering ``invoke`` (or being callable) that maps
            ``{"query": ...}`` to a dict containing a ``"result"`` entry.
        question: The question to ask.

    Returns:
        The value stored under ``"result"`` in the bot's response.
    """
    query = {"query": question}
    # Calling a chain directly (``bot(query)``) is deprecated in LangChain in
    # favour of ``invoke``; plain callables are still supported as a fallback.
    run = getattr(bot, "invoke", bot)
    result = run(query)
    return result["result"]
# Example usage: send one question to the chain built above and print the returned answer text
question = "What is the main topic of the first PDF?"
answer = ask_question(edubot, question)
print(f"Answer: {answer}")
Answer: The main topic of the first PDF is the improvement of word embeddings
using large unlabeled data sets.
[44]: question = "What is Part-Of-Speech Tagging?" # I have uploaded a file called 'Natural_Language_Processing_Almost_from_Scratch.pdf'
answer = ask_question(edubot, question)
print(f"Answer: {answer}")
Answer: Part-Of-Speech (POS) tagging is the task of assigning a word or phrase a
label that indicates its part of speech. This label can be a single word, such
as "noun", "verb", "adjective", etc., or it can be a more detailed label that
indicates the specific class of words that the word belongs to, such as "noun",
"countable noun", "uncountable noun", "verb", "past tense", "present tense",
etc. The goal of POS tagging is to accurately identify the part of speech for
each word in a sentence or text, which can be useful in various applications
such as language modeling, natural language processing, and information
retrieval.
[44]: