This post documents how to combine Llama3 + LangChain + llama.cpp + Chroma into a simple RAG pipeline!

Overall RAG architecture: [RAG architecture diagram]

  1. Load our supplementary documents.
  2. Use an embedding model to analyze the text and convert it into vectors.
  3. Store the resulting vectors in a vector database.
  4. Start our Llama3 model with LlamaCpp from LangChain's library.
  5. Now we can type in a question.
  6. The question is used to search the database for similar document chunks.
  7. Llama3 answers the question based on what was retrieved.

Code:

import os
import sys
import time
import chromadb
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Load data: pdf, doc, docx, txt ...
documents = []
# Create a List of Documents from all of our files in the ./docs folder
for file in os.listdir("docs"):
    print(f'Now loading: {file}')
    if file.endswith(".pdf"):
        pdf_path = "./docs/" + file
        loader = PyPDFLoader(pdf_path)
        documents.extend(loader.load())
    elif file.endswith('.docx') or file.endswith('.doc'):
        doc_path = "./docs/" + file
        loader = Docx2txtLoader(doc_path)
        documents.extend(loader.load())
    elif file.endswith('.txt'):
        text_path = "./docs/" + file
        loader = TextLoader(text_path)
        documents.extend(loader.load())

# Read and split the text into chunks (besides RecursiveCharacterTextSplitter, LangChain offers a few other splitters; check the LangChain library if you are interested)
start_time = time.time()
# Split the documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
documents = text_splitter.split_documents(documents)
end_time = time.time()
splitter_time = end_time-start_time
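
# Optional alternative (not in the original flow): TokenTextSplitter, already
# imported above, splits by token count instead of character count. A minimal
# sketch with the same (assumed) chunk settings:
# token_splitter = TokenTextSplitter(chunk_size=250, chunk_overlap=50)
# documents = token_splitter.split_documents(documents)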

# Embed the chunks we just split using the intfloat/multilingual-e5-large-instruct model; I'm running on CPU here, a GPU would be faster
start_time = time.time()
# Embedding Sentence-transformer Model
model_name = 'intfloat/multilingual-e5-large-instruct'
model_kwargs = {'device': 'cpu'}
embedding = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
end_time = time.time()
embedding_time = end_time-start_time
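
# Quick sanity check (not in the original): embed one query and check the vector
# size; multilingual-e5-large-instruct is expected to return 1024-dimensional vectors.
# print(len(embedding.embed_query("query: hello world")))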

# Put the embedded data into Chroma; here we use the simple in-memory setup
start_time = time.time()
# Set up Chroma in-memory, for easy prototyping. Persistence can be added easily
# by passing a persist_directory path instead of an in-memory client.
client = chromadb.Client()
# Convert the document chunks to embeddings and save them to the vector store.
# The "all-my-documents" collection is created on the fly (get_or_create).
vectordb = Chroma.from_documents(
    documents,
    embedding=embedding,
    client=client,
    collection_name="all-my-documents",
)
end_time = time.time()
vectordb_time = end_time-start_time
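
# Optional sanity check (not in the original): confirm retrieval returns something
# sensible before wiring up the LLM.
# for doc in vectordb.similarity_search("your test question", k=3):
#     print(doc.page_content[:80])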

# Start Llama3 with LlamaCpp; its parameters can be tuned here as well
# Main LLM
llm = LlamaCpp(
    model_path="/root/LLM-models/Llama3-8B-Chinese-Chat-q8_0-v2_1.gguf",
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=False,
    n_gpu_layers=100,
    n_batch=1600,
    n_ctx=8000,
    f16_kv=False,
    temperature=0.7,
    use_mlock=True,
    use_mmap=True,
    vocab_only=False,
)
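
# Optional smoke test (not in the original): call the model once directly; the
# streaming callback above prints tokens to stdout as they are generated.
# llm.invoke("Say hello in one short sentence.")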

# Set up the prompt template, etc., and wire the chain so it searches the DB first and then hands the results to Llama3 to answer
# With Langchain Model
# This controls how the standalone question is generated.
# Should take `chat_history` and `question` as input variables.
template = (
    "Combine the chat history and follow up question into "
    "a standalone question. Chat History: {chat_history}"
    "Follow up question: {question}"
)
prompt = PromptTemplate.from_template(template)
# create our Q&A chain
pdf_qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    condense_question_prompt=prompt,
    retriever=vectordb.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False,
)

# The plain model, used to compare its answers against the LangChain retrieval chain
# Original Model
template = """
Question: {question}

Answer:
"""
prompt = PromptTemplate(template=template, input_variables=["question"])
# Create an LLMChain to manage interactions with the prompt and model
Ollm = LLMChain(prompt=prompt, llm=llm)


yellow = "\033[0;33m"
green = "\033[0;32m"
white = "\033[0;39m"

chat_history = []
print(f"{yellow}---------------------------------------------------------------------------------")
print(f'Init Time:')
print(f'Splitter_Time:{splitter_time} seconds')
print(f'Embedding_Time:{embedding_time} seconds')
print(f'Vectordb_Time:{vectordb_time} seconds')
print('Welcome to the DocBot. You are now ready to start interacting with your documents')
print('---------------------------------------------------------------------------------')
while True:
    query = input(f"{green}Prompt: ")
    if query == "exit" or query == "quit" or query == "q" or query == "f":
        print('Exiting')
        sys.exit()
    if query == '':
        continue
    # Answer from Llama3 with Chroma DB retrieval
    result = pdf_qa.invoke(
        {"question": query, "chat_history": chat_history})
    print(f"{white}Answer: " + result["answer"])
    print("=====================================================")
    # Answer from the plain Llama3 model (no retrieval)
    Ollm_answer = Ollm.invoke(query)
    print(Ollm_answer, '\n')
    print("=====================================================")
    # Chat history recording (currently disabled)
    #chat_history.append((query, result["answer"]))

And with that we have implemented a simple RAG. Easy, right!!
If you are interested, you could also wire in a web search API, but the ones I looked at all cost money, so I skipped it.
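
For reference only, here is a minimal sketch of one free, no-API-key option using LangChain's DuckDuckGo tool. It is not part of the script above, and it assumes the duckduckgo-search package is installed:

from langchain_community.tools import DuckDuckGoSearchRun

# Run a quick web search; the returned text could then be added to the retrieval context.
search = DuckDuckGoSearchRun()
print(search.run("retrieval-augmented generation"))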

Ref:
RAG實作教學,LangChain + Llama2 |創造你的個人LLM

用筆電就能跑 LLaMA ! llama.cpp 教學