tree3po committed (verified)
Commit 333db1f · Parent: 4383c1b

Create app.py

Files changed (1)
  1. app.py +156 -0
app.py ADDED
@@ -0,0 +1,156 @@
import os
import random

import gradio as gr
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
from langchain_core.load import dumps
from langchain_chroma import Chroma
from langchain_text_splitters import CharacterTextSplitter
from pypdf import PdfReader
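
# A small retrieval-augmented chat app: uploaded .txt/.pdf files are chunked,
# embedded with sentence-transformers, and stored in a local Chroma DB; each
# chat turn retrieves the closest chunks and hands them to Mixtral through the
# Hugging Face Inference API.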
# The API token is read from the HF_TOKEN environment variable (an assumed
# secret name; the Space must define it for authenticated Inference API calls).
token = os.environ.get("HF_TOKEN", "")
#repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
emb = "sentence-transformers/all-mpnet-base-v2"
hf = HuggingFaceEmbeddings(model_name=emb)
# langchain_chroma persists automatically when persist_directory is set, so no
# explicit persist() call is needed; the embedding function is attached so
# similarity search and add_texts() can embed on the fly.
db = Chroma(persist_directory="./chroma_langchain_db", embedding_function=hf)

# Load the document, split it into chunks, embed each chunk, and load it into the vector store.
def embed_fn(inp):
    print("Try Embeddings")
    print(inp)
    print("End Embeddings")
    # Split the raw text into small overlapping chunks.
    text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=10)
    documents = text_splitter.split_text(inp)
    print("documents")
    print(documents)
    print("end documents")
    # Debug output: embed the chunks and print a serialized copy of the vectors.
    out_emb = hf.embed_documents(documents)
    string_representation = dumps(out_emb, pretty=True)
    print(string_representation)
    # add_texts() embeds the chunks with the store's embedding function and
    # writes them into the persistent collection.
    db.add_texts(documents)
    print("DB")
    print(db)
    print("end DB")
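
# proc_doc() streams status messages to the UI while each uploaded file is
# read and embedded; gr.Files hands it a list of local file paths.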
def proc_doc(doc_in):
    for doc in doc_in:
        if doc.endswith(".txt"):
            # Status text is yielded as plain strings for the gr.HTML output.
            yield f"Loading Document: {doc}"
            outp = read_txt(doc)
            embed_fn(outp)
            yield "Loaded"
        elif doc.endswith(".pdf"):
            yield f"Loading Document: {doc}"
            outp = read_pdf(doc)
            embed_fn(outp)
            yield "Loaded"

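
# File readers: each returns the whole document as one plain-text string.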
def read_txt(txt_path):
    with open(txt_path, "r") as f:
        text = f.read()
    return text

def read_pdf(pdf_path):
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text = f"{text}\n{page.extract_text()}"
    return text
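
# run_llm() is the ChatInterface callback: it retrieves context for the user's
# message, builds an instruct-formatted prompt, and streams back the reply.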
def run_llm(input_text, history):
    # Embed the user's message and pull the three closest chunks from Chroma.
    qur = hf.embed_query(input_text)
    docs = db.similarity_search_by_vector(qur, k=3)
    print(docs)
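
    # Text-generation settings for the Inference API: temperature near zero
    # keeps sampling almost deterministic, while the random seed varies runs.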
    llm = HuggingFaceEndpoint(
        repo_id=repo_id,
        max_new_tokens=2056,
        seed=random.randint(1, 99999999999),
        top_k=10,
        top_p=0.95,
        typical_p=0.95,
        temperature=0.01,
        repetition_penalty=1.03,
        streaming=True,
        huggingfacehub_api_token=token,
    )
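
    # Build a single instruct-formatted prompt: Mixtral-Instruct expects the
    # request wrapped in [INST] ... [/INST] tags, and HuggingFaceEndpoint is a
    # plain text-completion interface rather than a chat model.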
    prompt = (
        f"[INST] Use this data to help answer the user's question: {str(docs)}\n"
        f"{input_text} [/INST]"
    )
    print(input_text)
    print(history)
    out = ""
    # Stream chunks as they arrive so the chat window shows a growing answer.
    for chunk in llm.stream(prompt):
        out += chunk
        yield out

css = """
#component-0 {
  height: 400px;
}
"""
with gr.Blocks(css=css) as app:
    data = gr.State()
    with gr.Column():
        chat = gr.ChatInterface(
            fn=run_llm,
            type="tuples",
            concurrency_limit=20,
        )
    with gr.Row():
        msg = gr.HTML()
        file_in = gr.Files(file_count="multiple")
        file_in.change(proc_doc, file_in, msg)

app.queue().launch()