hoan17 committed on
Commit
b6e5245
·
verified ·
1 Parent(s): 825a440

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +130 -0
  3. quyche_uit_plus_removed.pdf +3 -0
  4. requirements.txt +13 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ quyche_uit_plus_removed.pdf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Chatbot_LLM_with_RAG Quyche_FINAL.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1kRGRGeOuF9JORajZPlEI2H0IrvcrgYr0
8
+ """
9
+
10
+
11
+ import os
12
+ import textwrap
13
+
14
+ import chromadb
15
+ import langchain
16
+ import openai
17
+ from langchain.chains import RetrievalQA
18
+ from langchain.chat_models import ChatOpenAI
19
+ from langchain.document_loaders import TextLoader, UnstructuredPDFLoader, YoutubeLoader, PyPDFLoader
20
+ from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
21
+ from langchain.indexes import VectorstoreIndexCreator
22
+ from langchain.llms import OpenAI
23
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
24
+ from langchain.vectorstores import Chroma
25
+ from langchain.llms import GPT4All
26
+ from pdf2image import convert_from_path
27
+
28
+
29
+
30
+ # !pip uninstall 'git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a'
31
+
32
+ """Download file pdf"""
33
+
34
+ # Download file pdf
35
+ # !gdown https://drive.google.com/uc?id=19_MlM7Cmw8z_j40dk80PQbITYNET3tL2
36
+ # !gdown https://drive.google.com/uc?id=1gdM3TfvyQPDXOuFjNS9n_DgD24ThDB84
37
+
38
FILE_NAME = "quyche_uit_plus_removed.pdf"

# --- Load Data & Model ---

from getpass import getpass

# SECURITY: the API key must never be committed to source control. An earlier
# revision of this file embedded a literal key; that key should be treated as
# leaked and revoked. The key is now taken from the OPENAI_API_KEY environment
# variable (e.g. a deployment secret), with an interactive prompt as a
# fallback for local / notebook runs.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    OPENAI_API_KEY = getpass("OpenAI API key: ")

# Re-export so that libraries reading the environment variable see the key
# even when it was entered interactively.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# temperature=0 keeps the answers as deterministic as the backend allows.
model = OpenAI(temperature=0, model_name="gpt-3.5-turbo")
49
+ # (trang)
50
+
51
# Render the PDF pages to images at 88 dpi.
# NOTE(review): `images` is only referenced by commented-out notebook code in
# this file; confirm nothing else needs it before dropping this step.
images = convert_from_path(FILE_NAME, dpi=88)

# --- Load the PDF ---
# Parse the local PDF file with UnstructuredPDFLoader and split it into
# documents.
pdf_loader = UnstructuredPDFLoader(FILE_NAME)
pdf_pages = pdf_loader.load_and_split()

# --- Text splitting ---
# Re-chunk the loaded documents (chunk_size=1024, chunk_overlap=64) so each
# piece handed to the retriever stays small.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
)
texts = text_splitter.split_documents(pdf_pages)

# --- Create Embeddings & Vectorstores ---
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)

# Embed every chunk and store the vectors in a Chroma collection persisted
# under the "db" directory.
db = Chroma.from_documents(
    documents=texts,
    embedding=hf_embeddings,
    persist_directory="db",
)
76
+
77
+ """#Use a Chain"""
78
+
79
+ custom_prompt_template = """Sử dụng các thông tin sau đây để trả lời câu hỏi của người dùng.
80
+ Nếu bạn không biết câu trả lời, chỉ cần nói rằng bạn không biết, đừng cố bịa ra câu trả lời.
81
+ Tất cả câu trả lời của bạn đều phải trả lời bằng tiếng việt
82
+
83
+ Context: {context}
84
+ Question: {question}
85
+
86
+ """
87
+
88
+ from langchain import PromptTemplate
89
def set_custom_prompt():
    """Build the prompt template used for QA retrieval.

    Returns:
        A ``PromptTemplate`` wrapping ``custom_prompt_template`` whose input
        variables are ``context`` and ``question``.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=custom_prompt_template,
    )
96
+
97
# Assemble the retrieval QA chain: the 3 nearest chunks from the vector store
# are "stuffed" into the custom prompt and sent to the LLM.
prompt = set_custom_prompt()
chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff",
    chain_type_kwargs={'prompt': prompt},
)
104
+
105
+ """#QA Chatbot"""
106
+
107
def print_response(response: str):
    """Print *response* wrapped to lines of at most 100 characters."""
    wrapped = textwrap.fill(response, width=100)
    print(wrapped)
109
+
110
+ # query = "Các môn bổ túc kiến thức của khóa cao học ngành khoa học máy tính gồm những môn nào?"
111
+ # response = chain.run(query)
112
+ # print_response(response)
113
+
114
+
115
+ # from langchain.chat_models import ChatOpenAI
116
+ from langchain.schema import AIMessage, HumanMessage
117
+ # import openai
118
+ import gradio as gr
119
+
120
def predict(message, history):
    """Answer a single chat message with the retrieval QA chain.

    Args:
        message: The user's latest message; it is passed to ``chain.run``
            as the query.
        history: Previous (user, assistant) turns supplied by the chat UI.
            Accepted to satisfy the callback signature but intentionally
            unused: the chain is queried with ``message`` only, so each
            question is answered independently of earlier turns.

    Returns:
        Whatever ``chain.run(message)`` returns.
    """
    # An earlier version converted `history` into LangChain message objects
    # on every call but never passed them to anything. That dead work has
    # been removed; it also could fail on history entries whose content was
    # not a string.
    return chain.run(message)
128
+
129
# Expose `predict` through Gradio's chat UI and start the server.
# NOTE(review): share=True requests a public share link; confirm this is
# wanted (and supported) in the environment where the app is hosted.
chatbot = gr.ChatInterface(fn=predict)
chatbot.launch(share=True)
quyche_uit_plus_removed.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7241999a904618f37690171837ca12cded7b85ad175520dd7e4e9ac71fb9fcd2
3
+ size 7643952
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ openai==0.27.4
3
+ watermark
4
+ poppler-utils
5
+ langchain==0.0.173
6
+ langchain_community
7
+ chromadb==0.3.23
8
+ pypdf==3.8.1
9
+ pygpt4all==1.1.0
10
+ pdf2image==1.16.3
11
+ tiktoken==0.3.3
12
+ unstructured[local-inference]==0.5.12
13
+ gradio==3.38.0