ChaBotRAG / rag.py
{user_name}
영어를 기본으로 설정함
207d98f
################
# Load the PDF file, split the document into chunks, vectorize the chunks, then query them
# (NOTE: the loader in this file reads a Markdown file, not a PDF.)
################
import tiktoken
# Module-level tokenizer using tiktoken's 'cl100k_base' encoding; created once at import.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module-level tokenizer.

    Passed to the text splitter as its `length_function`, so chunk sizes are
    measured in tokens instead of characters.
    """
    return len(tokenizer.encode(text))
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata
## Load the pdf file and split it
# https://python.langchain.com/v0.2/docs/how_to/document_loader_markdown/
# Load the Markdown file and split it
# Read the Markdown knowledge base in "elements" mode and take the loader's
# own initial split as the starting set of documents.
loader = UnstructuredMarkdownLoader('Document/Knowledge.md', mode="elements")
pages = loader.load_and_split()

# Split the text into chunks: at most 500 tokens each with an 80-token overlap,
# where length is measured by tiktoken_len rather than by character count.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
    length_function=tiktoken_len,
)

# Chunk the pages, then run the chunks through filter_complex_metadata
# (presumably to drop metadata values the vector store cannot hold -- confirm
# against the LangChain docs).
sourceDocs = filter_complex_metadata(text_splitter.split_documents(pages))
################
# Vectorize the documents with a HuggingFace model, then run similarity search
################
from langchain.vectorstores import Chroma
# Embedding model: 'jhgan/ko-sroberta-multitask', run on the CPU, with
# embedding normalization requested at encode time.
model_huggingface = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-multitask',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

## Chroma-based vector store built from the document chunks.
# NOTE(review): no persist directory is passed, so the index is presumably
# rebuilt every time this module is imported -- confirm.
db = Chroma.from_documents(documents=sourceDocs, embedding=model_huggingface)
## Querying
def SearchDocs(question, k=4):
    """Return the text of the `k` stored chunks most similar to `question`.

    Queries the module-level `db` vector store via
    `similarity_search_with_relevance_scores`, discards the scores, and
    concatenates each hit's `page_content` in the order returned.

    Args:
        question: Query string to search for.
        k: Number of hits requested from the vector store (default 4).

    Returns:
        A single string in which every hit is preceded by a blank-line
        separator ('\\n\\n'); the empty string when there are no hits.
    """
    hits = db.similarity_search_with_relevance_scores(question, k=k)
    # Each hit is a (document, score) pair; only the document text is kept.
    return ''.join('\n\n' + doc.page_content for doc, _score in hits)
# # Query test
# question = "자연어 처리란 무엇인가요?"  # "What is natural language processing?"
# print(SearchDocs(question, k=1))