{user_name}
committed on
Commit · b16918e
1 Parent(s): d0c988b
Update space
- Module/rag.py +67 -0
Module/rag.py
ADDED
@@ -0,0 +1,67 @@
################
# Load a PDF file, split the document, embed it into vectors, then run a query
################
import tiktoken
tokenizer = tiktoken.get_encoding('cl100k_base')
def tiktoken_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)
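
# Illustrative check, not part of the original commit: the splitter below
# measures chunk length in tokens via tiktoken_len, not in characters.
print(tiktoken_len('PDF preprocessing example'))  # prints a small token count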

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings

## Load the PDF and split it into pages
loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')
pages = loader.load_and_split()

## Split the pages into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80, length_function=tiktoken_len)
sourceDocs = text_splitter.split_documents(pages)
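
# Illustrative check, not part of the original commit: verify the split stayed
# within the 500-token budget (chunks share 80 tokens of overlap for context).
print(len(sourceDocs), 'chunks')
print(max(tiktoken_len(d.page_content) for d in sourceDocs), 'max tokens per chunk')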

################
# Embed the documents with a HuggingFace model, then run a similarity search
################
from langchain.embeddings import HuggingFaceEmbeddings

model_huggingface = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask',
                                          model_kwargs={'device': 'cpu'},
                                          encode_kwargs={'normalize_embeddings': True})
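
# Illustrative check, not part of the original commit: with
# normalize_embeddings=True every vector has unit L2 norm, so cosine
# similarity reduces to a plain dot product. The sample text is arbitrary.
_vec = model_huggingface.embed_query('임베딩 테스트')  # "embedding test"
print(len(_vec), sum(x * x for x in _vec))  # dimension (768), squared norm ~1.0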

## Embed the PDF chunks into a Chroma vector store
db = Chroma.from_documents(sourceDocs, model_huggingface)

## Run a query
question = '삼성전자의 주요 사업 영역은?'  # "What are Samsung Electronics' main business areas?"
docs3 = db.similarity_search_with_relevance_scores(question, k=1)  # Is k=2 too slow? No answer comes back..?
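
# Illustrative check, not part of the original commit:
# similarity_search_with_relevance_scores returns (Document, score) pairs,
# where higher scores mean more relevant matches.
for doc, score in docs3:
    print(round(score, 3), doc.page_content[:80])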

# Save the store to disk and load it back
# db_toFiles = Chroma.from_documents(sourceDocs, model_huggingface, persist_directory='./samsumg.db')
# db_fromfile = Chroma(persist_directory='./samsumg.db', embedding_function=model_huggingface)
# docs3 = db_fromfile.similarity_search_with_relevance_scores(question, k=3)
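
# Illustrative note, not part of the original commit: depending on the
# langchain/chromadb versions installed, a Chroma store created with
# persist_directory may save automatically; older wrappers also expose
# db.persist() to flush it explicitly.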

joinDoc = ' '.join([doc[0].page_content for doc in docs3])
print(joinDoc)

################
# Pass the retrieved context to a prompt and generate an answer with an LLM
################
from langchain_community.chat_models import ChatOllama
llm = ChatOllama(
    base_url='http://localhost:11434',
    # model="phi3:medium",  # too slow, switched to mini
    model="phi3:mini",
)

from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "Please answer the following question from the document: {document}"),
    ("user", "{question}"),
])

print('-' * 50)
chain = prompt | llm
print(chain.invoke({"question": question, "document": joinDoc}))
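
# Illustrative variant, not part of the original commit: chain.invoke returns
# an AIMessage; piping through StrOutputParser yields just the answer text.
from langchain_core.output_parsers import StrOutputParser
chain_text = prompt | llm | StrOutputParser()
print(chain_text.invoke({"question": question, "document": joinDoc}))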