Carlosito16 committed
Commit • 986ac67
Parent(s): 4d045a8
Upload 3 files
Browse files
- pages/1_data.py +150 -0
- pages/2_model.py +51 -0
- pages/3_chat.py +28 -0
pages/1_data.py
ADDED
@@ -0,0 +1,150 @@
import streamlit as st
import pandas as pd
import copy
from googletrans import Translator
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from streamlit_extras.row import row
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from collections import Counter
import torch
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Session-state slots shared with the model and chat pages.
if 'faiss_db' not in st.session_state:
    st.session_state['faiss_db'] = 0

if 'chunked_count_list' not in st.session_state:
    st.session_state['chunked_count_list'] = 0

if 'chunked_df' not in st.session_state:
    st.session_state['chunked_df'] = 0


def make_clickable(link):
    # Render a URL as an HTML anchor so the dataframe shows clickable links.
    text = link.split()[0]
    return f'<a target="_blank" href="{link}">{text}</a>'


user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36'
headers = {'User-Agent': user_agent}


def scrape_url(url_list):
    # Fetch each page and concatenate the text of its <p> and <ol> elements.
    all_whole_text = []
    for url in url_list:
        html_doc = requests.get(url, headers=headers)
        soup = BeautifulSoup(html_doc.text, 'html.parser')
        whole_text = ""
        for paragraph in soup.find_all():
            if paragraph.name in ["p", "ol"]:
                whole_text += paragraph.text.replace('\xa0', '').replace('\n', '').strip()
        all_whole_text.append(whole_text)
    return all_whole_text


def create_count_list(chunked_text):
    # Count how many chunks each source document was split into.
    original_count_list = []
    for item in range(len(chunked_text)):
        original_count_list.append(chunked_text[item].metadata['document'])
    item_counts = Counter(original_count_list)
    count_list = list(item_counts.values())
    return count_list


def thai_to_eng(text):
    translated = translator.translate(text, src='th', dest='en')
    return translated


def eng_to_thai(text):
    translated = translator.translate(text, src='en', dest='th')
    return translated


# st.set_page_config(page_title=None, page_icon=None, layout="wide")

url_list = ["https://www.mindphp.com/คู่มือ/openerp-manual.html#google_vignette",
            "https://www.mindphp.com/คู่มือ/openerp-manual/7874-refund.html",
            "https://www.mindphp.com/คู่มือ/openerp-manual/8842-50-percent-discount-on-erp.html",
            "https://www.mindphp.com/คู่มือ/openerp-manual/7873-hr-payroll-account.html",
            "https://www.mindphp.com/คู่มือ/openerp-manual/4255-supplier-payments.html"]  # or whatever default

metadatas = [{"document": i, "url": j} for i, j in enumerate(url_list)]

scrape_list = scrape_url(url_list)
translator = Translator()

splitter_row = row([2, 2, 1], vertical_align="bottom")
var1 = splitter_row.number_input("Chunk Size", value=1200)
var2 = splitter_row.number_input("Chunk Overlap Size", value=100)

split_button = splitter_row.button("Split the data")

if split_button:
    # Chunk size and overlap come from the number inputs above.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=var1,
        chunk_overlap=var2,
        length_function=len
    )

    chunked_text = text_splitter.create_documents(scrape_list, metadatas=metadatas)
    chunked_count_list = create_count_list(chunked_text)

    print(len(url_list), len(chunked_count_list))  # sanity check

    url_dataframe = pd.DataFrame({'link': url_list, 'number_of_chunks': chunked_count_list})
    url_dataframe['link'] = url_dataframe['link'].apply(make_clickable)
    url_dataframe = url_dataframe.to_html(escape=False)

    st.session_state['chunked_df'] = url_dataframe

    st.write(url_dataframe, unsafe_allow_html=True)
    # st.dataframe(url_dataframe)

    with st.expander("chunked items"):
        st.json(chunked_text)

    # Translate every chunk from Thai to English before embedding.
    translated_chunk_text = copy.deepcopy(chunked_text)
    for chunk in range(len(translated_chunk_text)):
        translated_chunk_text[chunk].page_content = thai_to_eng(translated_chunk_text[chunk].page_content).text

    # st.json(translated_chunk_text)

    embedding_model = HuggingFaceInstructEmbeddings(
        model_name='hkunlp/instructor-base',
        model_kwargs={'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu')})

    faiss_db = FAISS.from_documents(translated_chunk_text, embedding_model)

    st.session_state['faiss_db'] = faiss_db
    st.session_state['chunked_count_list'] = chunked_count_list

    st.write('successfully preprocessed data ✅')
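Once this page runs, the FAISS index lives in st.session_state and is reachable from the other pages. A minimal sketch of querying it directly (the query string is illustrative, not from the app):

import streamlit as st

# Sketch: query the index built by pages/1_data.py (assumes that page has run).
faiss_db = st.session_state['faiss_db']
docs = faiss_db.similarity_search("How do I record a supplier payment?", k=3)
for doc in docs:
    st.write(doc.metadata['url'], doc.page_content[:200])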
pages/2_model.py
ADDED
@@ -0,0 +1,51 @@
import streamlit as st
import torch
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from streamlit_extras.row import row

# Session-state slots shared with the chat page.
if 'model' not in st.session_state:
    st.session_state['model'] = 0
if 'max_length' not in st.session_state:
    st.session_state['max_length'] = 0
if 'temperature' not in st.session_state:
    st.session_state['temperature'] = 0
if 'repetition_penalty' not in st.session_state:
    st.session_state['repetition_penalty'] = 0


def load_llm_model(max_length, temperature, repetition_penalty):
    # llm = HuggingFacePipeline.from_model_id(model_id='lmsys/fastchat-t5-3b-v1.0',
    #                                         task='text2text-generation',
    #                                         model_kwargs={"device_map": "auto",
    #                                                       "load_in_8bit": True, "max_length": 256,
    #                                                       "temperature": 0, "repetition_penalty": 1.5})

    llm = HuggingFacePipeline.from_model_id(model_id='lmsys/fastchat-t5-3b-v1.0',
                                            task='text2text-generation',
                                            model_kwargs={"max_length": max_length,
                                                          "temperature": temperature,
                                                          "torch_dtype": torch.float32,
                                                          "repetition_penalty": repetition_penalty})
    return llm


model_row = row([2, 2, 2], vertical_align="bottom")
max_length = model_row.number_input("max_length", value=256)
temperature = model_row.number_input("temperature", value=0)
repetition_penalty = model_row.number_input("repetition_penalty", value=1.3)
load_model_button = st.button("load model")

if load_model_button:
    st.session_state['max_length'] = max_length
    st.session_state['temperature'] = temperature
    st.session_state['repetition_penalty'] = repetition_penalty

    st.session_state['model'] = load_llm_model(max_length, temperature, repetition_penalty)

    st.write('successfully loaded the model ✅')
    st.markdown(st.session_state['max_length'])
    st.markdown(st.session_state['temperature'])
    st.markdown(st.session_state['repetition_penalty'])
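With the pipeline stored in session state, a quick smoke test from any page might look like this sketch (the prompt is illustrative; a HuggingFacePipeline is a LangChain LLM, so it can be called on a raw string):

import streamlit as st

# Sketch: call the LLM loaded by pages/2_model.py (assumes "load model" was clicked).
llm = st.session_state['model']
st.write(llm("What modules does OpenERP provide?"))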
pages/3_chat.py
ADDED
@@ -0,0 +1,28 @@
import streamlit as st
from streamlit_extras.stateful_chat import chat, add_message
from langchain.chains import RetrievalQA


with st.expander("key information"):
    st.write(st.session_state['chunked_df'], unsafe_allow_html=True)
    st.markdown(st.session_state['max_length'])
    st.markdown(st.session_state['temperature'])
    st.markdown(st.session_state['repetition_penalty'])


# qa_retriever = RetrievalQA.from_chain_type(llm=st.session_state['model'], chain_type="stuff",
#                                            retriever=st.session_state['faiss_db'].as_retriever())


# with chat(key="my_chat"):
#     if prompt := st.chat_input():
#         add_message("user", prompt, avatar="🧑‍💻")
#         # def stream_echo():
#         #     for word in prompt.split():
#         #         yield word + " "
#         #         time.sleep(0.15)
#         add_message("assistant", "Echo: ", qa_retriever.run(prompt), avatar="🦜")

# query = "How to process documents about HR"
# docs = st.session_state['faiss_db'].similarity_search(query)
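The chat wiring above is still commented out in this commit. Uncommented and pointed at the objects the earlier pages store in session state, it might run roughly like this sketch (session-state keys follow pages/1_data.py and pages/2_model.py; the "Echo: " prefix is dropped here):

import streamlit as st
from streamlit_extras.stateful_chat import chat, add_message
from langchain.chains import RetrievalQA

# Build the retrieval-QA chain from the model and index the other pages stored.
qa_retriever = RetrievalQA.from_chain_type(llm=st.session_state['model'],
                                           chain_type="stuff",
                                           retriever=st.session_state['faiss_db'].as_retriever())

with chat(key="my_chat"):
    if prompt := st.chat_input():
        add_message("user", prompt, avatar="🧑‍💻")
        add_message("assistant", qa_retriever.run(prompt), avatar="🦜")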