lfoppiano commited on
Commit
844c34d
β€’
1 Parent(s): d251baf

update possibility to use mistral

Browse files
Files changed (2) hide show
  1. README.md +2 -3
  2. streamlit_app.py +48 -24
README.md CHANGED
@@ -4,15 +4,14 @@
4
 
5
  Question/Answering on documents. Upload your document and ask questions.
6
  In our implementation we use [Grobid](https://github.com/kermitt2/grobid) for text extraction instead of the raw PDF2Text converter.
7
- Thanks to Grobid we are able to precisely extract abstract and full-text.
8
  This is just the beginning, and publishing might help gather more feedback.
9
 
10
  **Work in progress**
11
 
12
  https://document-insights.streamlit.app/
13
 
14
- **OpenAI Key required**: At the moment you need the OpenAi Key to use this app.
15
- Google or look for example [here](https://medium.com/geekculture/openai-a-step-by-step-guide-to-getting-your-api-key-gpt-usage-control-artificial-intelligence-2a0917c70f3f), on how to generate one.
16
 
17
 
18
  ### Screencast
 
4
 
5
  Question/Answering on documents. Upload your document and ask questions.
6
  In our implementation we use [Grobid](https://github.com/kermitt2/grobid) for text extraction instead of the raw PDF2Text converter.
7
+ Thanks to Grobid we are able to precisely extract abstract and full-text.
8
  This is just the beginning, and publishing might help gather more feedback.
9
 
10
  **Work in progress**
11
 
12
  https://document-insights.streamlit.app/
13
 
14
+ **OpenAI or HuggingFace API key required**
 
15
 
16
 
17
  ### Screencast
streamlit_app.py CHANGED
@@ -3,20 +3,21 @@ from hashlib import blake2b
3
  from tempfile import NamedTemporaryFile
4
 
5
  import dotenv
 
6
 
7
  dotenv.load_dotenv(override=True)
8
 
9
  import streamlit as st
10
  from langchain.chat_models import PromptLayerChatOpenAI
11
- from langchain.embeddings import OpenAIEmbeddings
12
 
13
  from document_qa_engine import DocumentQAEngine
14
 
15
  if 'rqa' not in st.session_state:
16
  st.session_state['rqa'] = None
17
 
18
- if 'openai_key' not in st.session_state:
19
- st.session_state['openai_key'] = False
20
 
21
  if 'doc_id' not in st.session_state:
22
  st.session_state['doc_id'] = None
@@ -44,15 +45,23 @@ def new_file():
44
 
45
 
46
  @st.cache_resource
47
- def init_qa(openai_api_key):
48
- chat = PromptLayerChatOpenAI(model_name="gpt-3.5-turbo",
49
- temperature=0,
50
- return_pl_id=True,
51
- pl_tags=["streamlit", "chatgpt"],
52
- openai_api_key=openai_api_key)
53
- # chat = ChatOpenAI(model_name="gpt-3.5-turbo",
54
- # temperature=0)
55
- return DocumentQAEngine(chat, OpenAIEmbeddings(openai_api_key=openai_api_key), grobid_url=os.environ['GROBID_URL'])
 
 
 
 
 
 
 
 
56
 
57
 
58
  def get_file_hash(fname):
@@ -77,14 +86,28 @@ def play_old_messages():
77
  st.write(message['content'])
78
 
79
 
80
- has_openai_api_key = False
81
- if not st.session_state['openai_key']:
82
- openai_api_key = st.sidebar.text_input('OpenAI API Key')
83
- if openai_api_key:
84
- st.session_state['openai_key'] = has_openai_api_key = True
85
- st.session_state['rqa'] = init_qa(openai_api_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  else:
87
- has_openai_api_key = st.session_state['openai_key']
88
 
89
  st.title("πŸ“ Document insight Q&A")
90
  st.subheader("Upload a PDF document, ask questions, get insights.")
@@ -92,7 +115,7 @@ st.subheader("Upload a PDF document, ask questions, get insights.")
92
  upload_col, radio_col, context_col = st.columns([7, 2, 2])
93
  with upload_col:
94
  uploaded_file = st.file_uploader("Upload an article", type=("pdf", "txt"), on_change=new_file,
95
- disabled=not has_openai_api_key,
96
  help="The file will be uploaded to Grobid, extracted the text and calculated "
97
  "embeddings of each paragraph which are then stored to a Db for be picked "
98
  "to answer specific questions. ")
@@ -113,19 +136,20 @@ question = st.chat_input(
113
 
114
  with st.sidebar:
115
  st.header("Documentation")
116
- st.write("""To upload the PDF file, click on the designated button and select the file from your device.""")
 
117
 
118
- st.write(
119
  """After uploading, please wait for the PDF to be processed. You will see a spinner or loading indicator while the processing is in progress. Once the spinner stops, you can proceed to ask your questions.""")
120
 
121
  st.markdown("**Revision number**: [" + st.session_state[
122
  'git_rev'] + "](https://github.com/lfoppiano/grobid-magneto/commit/" + st.session_state['git_rev'] + ")")
123
 
124
  st.header("Query mode (Advanced use)")
125
- st.write(
126
  """By default, the mode is set to LLM (Language Model) which enables question/answering. You can directly ask questions related to the PDF content, and the system will provide relevant answers.""")
127
 
128
- st.write(
129
  """If you switch the mode to "Embedding," the system will return specific paragraphs from the document that are semantically similar to your query. This mode focuses on providing relevant excerpts rather than answering specific questions.""")
130
 
131
  if uploaded_file and not st.session_state.loaded_embeddings:
 
3
  from tempfile import NamedTemporaryFile
4
 
5
  import dotenv
6
+ from langchain.llms.huggingface_hub import HuggingFaceHub
7
 
8
  dotenv.load_dotenv(override=True)
9
 
10
  import streamlit as st
11
  from langchain.chat_models import PromptLayerChatOpenAI
12
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
13
 
14
  from document_qa_engine import DocumentQAEngine
15
 
16
  if 'rqa' not in st.session_state:
17
  st.session_state['rqa'] = None
18
 
19
+ if 'api_key' not in st.session_state:
20
+ st.session_state['api_key'] = False
21
 
22
  if 'doc_id' not in st.session_state:
23
  st.session_state['doc_id'] = None
 
45
 
46
 
47
  @st.cache_resource
48
+ def init_qa(api_key, model):
49
+ if model == 'chatgpt-3.5-turbo':
50
+ chat = PromptLayerChatOpenAI(model_name="gpt-3.5-turbo",
51
+ temperature=0,
52
+ return_pl_id=True,
53
+ pl_tags=["streamlit", "chatgpt"],
54
+ openai_api_key=api_key)
55
+ embeddings = OpenAIEmbeddings(openai_api_key=api_key)
56
+ elif model == 'mistral-7b-instruct-v0.1':
57
+ chat = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1",
58
+ model_kwargs={"temperature": 0.01},
59
+ api_key=api_key)
60
+ embeddings = HuggingFaceEmbeddings(
61
+ model_name="all-MiniLM-L6-v2",
62
+ api_key=api_key)
63
+
64
+ return DocumentQAEngine(chat, embeddings, grobid_url=os.environ['GROBID_URL'])
65
 
66
 
67
  def get_file_hash(fname):
 
86
  st.write(message['content'])
87
 
88
 
89
+ model = st.sidebar.radio("Model", ("chatgpt-3.5-turbo", "mistral-7b-instruct-v0.1"),
90
+ index=1,
91
+ captions=[
92
+ "ChatGPT 3.5 Turbo + Ada-002-text (embeddings)",
93
+ "Mistral-7B-Instruct-V0.1 + Sentence BERT (embeddings)"
94
+ ],
95
+ help="Select the model you want to use.")
96
+
97
+ is_api_key_provided = False
98
+ if not st.session_state['api_key']:
99
+ if model == 'mistral-7b-instruct-v0.1':
100
+ api_key = st.sidebar.text_input('Huggingface API Key')
101
+ if api_key:
102
+ st.session_state['api_key'] = is_api_key_provided = True
103
+ st.session_state['rqa'] = init_qa(api_key)
104
+ elif model == 'chatgpt-3.5-turbo':
105
+ api_key = st.sidebar.text_input('OpenAI API Key')
106
+ if api_key:
107
+ st.session_state['api_key'] = is_api_key_provided = True
108
+ st.session_state['rqa'] = init_qa(api_key)
109
  else:
110
+ is_api_key_provided = st.session_state['api_key']
111
 
112
  st.title("πŸ“ Document insight Q&A")
113
  st.subheader("Upload a PDF document, ask questions, get insights.")
 
115
  upload_col, radio_col, context_col = st.columns([7, 2, 2])
116
  with upload_col:
117
  uploaded_file = st.file_uploader("Upload an article", type=("pdf", "txt"), on_change=new_file,
118
+ disabled=not is_api_key_provided,
119
  help="The file will be uploaded to Grobid, extracted the text and calculated "
120
  "embeddings of each paragraph which are then stored to a Db for be picked "
121
  "to answer specific questions. ")
 
136
 
137
  with st.sidebar:
138
  st.header("Documentation")
139
+ st.markdown("https://github.com/lfoppiano/document-qa")
140
+ st.markdown("""After entering your API Key (Open AI or Huggingface). Upload a scientific article as PDF document, click on the designated button and select the file from your device.""")
141
 
142
+ st.markdown(
143
  """After uploading, please wait for the PDF to be processed. You will see a spinner or loading indicator while the processing is in progress. Once the spinner stops, you can proceed to ask your questions.""")
144
 
145
  st.markdown("**Revision number**: [" + st.session_state[
146
  'git_rev'] + "](https://github.com/lfoppiano/grobid-magneto/commit/" + st.session_state['git_rev'] + ")")
147
 
148
  st.header("Query mode (Advanced use)")
149
+ st.markdown(
150
  """By default, the mode is set to LLM (Language Model) which enables question/answering. You can directly ask questions related to the PDF content, and the system will provide relevant answers.""")
151
 
152
+ st.markdown(
153
  """If you switch the mode to "Embedding," the system will return specific paragraphs from the document that are semantically similar to your query. This mode focuses on providing relevant excerpts rather than answering specific questions.""")
154
 
155
  if uploaded_file and not st.session_state.loaded_embeddings: