################
# Load a PDF file, split the document into chunks, vectorize them, then query
################
import tiktoken

# Token-based length function so chunk sizes are measured in LLM tokens,
# not characters
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)
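
## Illustrative sanity check: tiktoken_len measures length in cl100k_base
## tokens, which is the unit chunk_size below is expressed in (the sample
## string is arbitrary)
print(tiktoken_len('Retrieval-augmented generation splits documents into chunks.'))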
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
## Load the PDF and split it into pages
loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')
pages = loader.load_and_split()
## Split the pages into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                               chunk_overlap=80,
                                               length_function=tiktoken_len)
sourceDocs = text_splitter.split_documents(pages)
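
## Quick look at the split result: each element is a Document whose token
## length should not exceed chunk_size
print(len(sourceDocs), 'chunks')
print(tiktoken_len(sourceDocs[0].page_content), 'tokens in first chunk')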
################
# Vectorize the documents with a HuggingFace model, then run similarity search
################
model_huggingface = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask',
                                          model_kwargs={'device': 'cpu'},
                                          encode_kwargs={'normalize_embeddings': True})
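
## Because normalize_embeddings=True, every vector has unit length, so the dot
## product of two embeddings equals their cosine similarity. A quick check you
## can uncomment (the two sample sentences are arbitrary):
# import numpy as np
# v1, v2 = model_huggingface.embed_documents(['좋은 아침입니다', '안녕하세요'])
# print(np.dot(v1, v2))  # cosine similarity in [-1, 1]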
## Build a Chroma vector store from the chunked documents
db = Chroma.from_documents(sourceDocs, model_huggingface)
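
## A sketch of on-disk persistence (assumption: default Chroma client; the
## './chroma_db' path is an arbitrary example). Persisting avoids re-embedding
## the PDF on every run:
# db = Chroma.from_documents(sourceDocs, model_huggingface,
#                            persist_directory='./chroma_db')
# db = Chroma(persist_directory='./chroma_db',
#             embedding_function=model_huggingface)  # reload in a later run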
## Query the vector store
def SearchDocs(question, k=1):
    # similarity_search_with_relevance_scores returns (Document, score) pairs;
    # join the page contents of the top-k hits into one context string
    results = db.similarity_search_with_relevance_scores(question, k=k)
    merged = ' '.join([doc.page_content for doc, _score in results])
    return merged
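
## Example query (the question text is only an illustration):
# print(SearchDocs('What is the exam schedule?', k=2))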
# ################
# # Pass the retrieved documents into the prompt and generate an answer with the LLM
# ################
# from langchain_community.chat_models import ChatOllama
# llm = ChatOllama(
#     base_url='http://localhost:11434',
#     # model="phi3:medium",  # too slow, switched to mini
#     model="phi3:mini",
# )
# from langchain_core.prompts import ChatPromptTemplate
# prompt = ChatPromptTemplate.from_messages([
#     ("system", "Please answer the following question from the document: {document}"),
#     ("user", "{question}"),
# ])
# chain = prompt | llm
# def Response(question):
#     # SearchDocs already returns a merged context string, so pass it through as-is
#     searchedDocs = SearchDocs(question)
#     return chain.invoke({"question": question, "document": searchedDocs})
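#
# ## Example usage, assuming an Ollama server is running at localhost:11434
# ## with the phi3:mini model pulled (the question is only an illustration):
# answer = Response('Summarize the document in one sentence.')
# print(answer.content)  # ChatOllama returns an AIMessage; .content is the text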