mahynski committed
Commit a487212 · 1 Parent(s): 183da17

debug embeddings

Files changed (1):
  1. app.py +151 -147
app.py CHANGED
@@ -14,158 +14,162 @@ from llama_index.core import (
     Settings,
 )
 
-# os.environ["OPENAI_API_KEY"] = "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"
+os.environ["OPENAI_API_KEY"] = "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"
 
 from llama_parse import LlamaParse
 
 from streamlit_pdf_viewer import pdf_viewer
 
-# Global configurations
-from llama_index.core import set_global_handler
-set_global_handler("langfuse")
-st.set_page_config(layout="wide")
-
-with st.sidebar:
-    st.title('Document Summarization and QA System')
-    # st.markdown('''
-    # ## About this application
-    # Upload a pdf to ask questions about it. This retrieval-augmented generation (RAG) workflow uses:
-    # - [Streamlit](https://streamlit.io/)
-    # - [LlamaIndex](https://docs.llamaindex.ai/en/stable/)
-    # - [OpenAI](https://platform.openai.com/docs/models)
-    # ''')
-
-    # st.write('Made by ***Nate Mahynski***')
-    # st.write('nathan.mahynski@nist.gov')
-
-    # Select Provider
-    provider = st.selectbox(
-        label="Select LLM Provider",
-        options=['google', 'huggingface', 'mistralai', 'openai'],
-        index=0
-    )
-
-    # Select LLM
-    if provider == 'google':
-        llm_list = ['gemini']
-    elif provider == 'huggingface':
-        llm_list = []
-    elif provider == 'mistralai':
-        llm_list = []
-    elif provider == 'openai':
-        llm_list = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'gpt-4o', 'gpt-4o-mini']
-    else:
-        llm_list = []
-
-    llm_name = st.selectbox(
-        label="Select LLM Model",
-        options=llm_list,
-        index=0
-    )
-
-    # Temperature
-    temperature = st.slider(
-        "Temperature",
-        min_value=0.0,
-        max_value=1.0,
-        value=0.0,
-        step=0.05,
-    )
-
-    max_output_tokens = 4096
-
-    # Enter LLM Token
-    llm_token = st.text_input(
-        "Enter your LLM token",
-        value="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"
-    )
-
-    # Create LLM
-    if llm_token is not None:
-        if provider == 'openai':
-            os.environ["OPENAI_API_KEY"] = str(llm_token)
-            Settings.llm = OpenAI(
-                model=llm_name,
-                temperature=temperature,
-                max_tokens=max_output_tokens,
-                api_key=os.environ.get("OPENAI_API_KEY")
-            )
-            # Global tokenization needs to be consistent with LLM
-            # https://docs.llamaindex.ai/en/stable/module_guides/models/llms/
-            Settings.tokenizer = tiktoken.encoding_for_model(llm_name).encode
-            Settings.num_output = max_output_tokens
-            Settings.context_window = 4096  # max possible
-            Settings.embed_model = OpenAIEmbedding(api_key=os.environ.get("OPENAI_API_KEY"))
-        elif provider == 'huggingface':
-            os.environ['HFTOKEN'] = str(llm_token)
-
-    # Enter parsing Token
-    parse_token = st.text_input(
-        "Enter your LlamaParse token",
-        value="llx-uxxwLr1gZmDibaHTl99ISQJtpLSjjfhgDvnosGxu92RdRlb7"
-    )
-
-    uploaded_file = st.file_uploader(
-        "Choose a PDF file to upload",
-        # type=['pdf'],
-        accept_multiple_files=False
-    )
-
-    parsed_document = None
-    if uploaded_file is not None:
-        # Parse the file
-        parser = LlamaParse(
-            api_key=parse_token,  # can also be set in your env as LLAMA_CLOUD_API_KEY
-            result_type="text"  # "markdown" and "text" are available
-        )
-
-        # Create a temporary directory to save the file then load and parse it
-        temp_dir = tempfile.TemporaryDirectory()
-        temp_filename = os.path.join(temp_dir.name, uploaded_file.name)
-        with open(temp_filename, "wb") as f:
-            f.write(uploaded_file.getvalue())
-        parsed_document = parser.load_data(temp_filename)
-        temp_dir.cleanup()
-
-col1, col2 = st.columns(2)
-
-with col1:
-    st.markdown(
-        """
-        # Instructions
-
-        1. Obtain a [token](https://cloud.llamaindex.ai/api-key) (or API Key) from LlamaParse to parse your document.
-        2. Obtain a similar token from your preferred LLM provider.
-        3. Make selections at the left and upload a document to use as context.
-        4. Begin asking questions below!
-        """
-    )
-
-    st.divider()
-    if parsed_document is not None:
-        index = VectorStoreIndex.from_documents(parsed_document)
-        query_engine = index.as_query_engine()
-
-        prompt_txt = 'Summarize this document in 3-5 sentences.'
-        prompt = st.text_area(
-            label="Enter your query.",
-            key="prompt_widget",
-            value=prompt_txt
-        )
-
-        response = query_engine.query(prompt)
-        st.write(response.response)
-
-with col2:
-    tab1, tab2 = st.tabs(["Uploaded File", "Parsed File"])
-
-    with tab1:
-        # st.header('This is the raw file you uploaded.')
-        if uploaded_file is not None:  # Display the pdf
-            bytes_data = uploaded_file.getvalue()
-            pdf_viewer(input=bytes_data, width=700)
-
-    with tab2:
-        # st.header('This is the parsed version of the file.')
-        if parsed_document is not None:  # Show the raw parsing result
-            st.write(parsed_document)
+def main():
+    with st.sidebar:
+        st.title('Document Summarization and QA System')
+        # st.markdown('''
+        # ## About this application
+        # Upload a pdf to ask questions about it. This retrieval-augmented generation (RAG) workflow uses:
+        # - [Streamlit](https://streamlit.io/)
+        # - [LlamaIndex](https://docs.llamaindex.ai/en/stable/)
+        # - [OpenAI](https://platform.openai.com/docs/models)
+        # ''')
+
+        # st.write('Made by ***Nate Mahynski***')
+        # st.write('nathan.mahynski@nist.gov')
+
+        # Select Provider
+        provider = st.selectbox(
+            label="Select LLM Provider",
+            options=['google', 'huggingface', 'mistralai', 'openai'],
+            index=0
+        )
+
+        # Select LLM
+        if provider == 'google':
+            llm_list = ['gemini']
+        elif provider == 'huggingface':
+            llm_list = []
+        elif provider == 'mistralai':
+            llm_list = []
+        elif provider == 'openai':
+            llm_list = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'gpt-4o', 'gpt-4o-mini']
+        else:
+            llm_list = []
+
+        llm_name = st.selectbox(
+            label="Select LLM Model",
+            options=llm_list,
+            index=0
+        )
+
+        # Temperature
+        temperature = st.slider(
+            "Temperature",
+            min_value=0.0,
+            max_value=1.0,
+            value=0.0,
+            step=0.05,
+        )
+
+        max_output_tokens = 4096
+
+        # Enter LLM Token
+        llm_token = st.text_input(
+            "Enter your LLM token",
+            value="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"
+        )
+
+        # Create LLM
+        if llm_token is not None:
+            if provider == 'openai':
+                os.environ["OPENAI_API_KEY"] = str(llm_token)
+                Settings.llm = OpenAI(
+                    model=llm_name,
+                    temperature=temperature,
+                    max_tokens=max_output_tokens,
+                    api_key=os.environ.get("OPENAI_API_KEY")
+                )
+                # Global tokenization needs to be consistent with LLM
+                # https://docs.llamaindex.ai/en/stable/module_guides/models/llms/
+                Settings.tokenizer = tiktoken.encoding_for_model(llm_name).encode
+                Settings.num_output = max_output_tokens
+                Settings.context_window = 4096  # max possible
+                Settings.embed_model = OpenAIEmbedding(api_key=os.environ.get("OPENAI_API_KEY"))
+            elif provider == 'huggingface':
+                os.environ['HFTOKEN'] = str(llm_token)
+
+        # Enter parsing Token
+        parse_token = st.text_input(
+            "Enter your LlamaParse token",
+            value="llx-uxxwLr1gZmDibaHTl99ISQJtpLSjjfhgDvnosGxu92RdRlb7"
+        )
+
+        uploaded_file = st.file_uploader(
+            "Choose a PDF file to upload",
+            # type=['pdf'],
+            accept_multiple_files=False
+        )
+
+        parsed_document = None
+        if uploaded_file is not None:
+            # Parse the file
+            parser = LlamaParse(
+                api_key=parse_token,  # can also be set in your env as LLAMA_CLOUD_API_KEY
+                result_type="text"  # "markdown" and "text" are available
+            )
+
+            # Create a temporary directory to save the file then load and parse it
+            temp_dir = tempfile.TemporaryDirectory()
+            temp_filename = os.path.join(temp_dir.name, uploaded_file.name)
+            with open(temp_filename, "wb") as f:
+                f.write(uploaded_file.getvalue())
+            parsed_document = parser.load_data(temp_filename)
+            temp_dir.cleanup()
+
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.markdown(
+            """
+            # Instructions
+
+            1. Obtain a [token](https://cloud.llamaindex.ai/api-key) (or API Key) from LlamaParse to parse your document.
+            2. Obtain a similar token from your preferred LLM provider.
+            3. Make selections at the left and upload a document to use as context.
+            4. Begin asking questions below!
+            """
+        )
+
+        st.divider()
+        if parsed_document is not None:
+            index = VectorStoreIndex.from_documents(parsed_document)
+            query_engine = index.as_query_engine()
+
+            prompt_txt = 'Summarize this document in 3-5 sentences.'
+            prompt = st.text_area(
+                label="Enter your query.",
+                key="prompt_widget",
+                value=prompt_txt
+            )
+
+            response = query_engine.query(prompt)
+            st.write(response.response)
+
+    with col2:
+        tab1, tab2 = st.tabs(["Uploaded File", "Parsed File"])
+
+        with tab1:
+            # st.header('This is the raw file you uploaded.')
+            if uploaded_file is not None:  # Display the pdf
+                bytes_data = uploaded_file.getvalue()
+                pdf_viewer(input=bytes_data, width=700)
+
+        with tab2:
+            # st.header('This is the parsed version of the file.')
+            if parsed_document is not None:  # Show the raw parsing result
+                st.write(parsed_document)
+
+if __name__ == '__main__':
+    # Global configurations
+    from llama_index.core import set_global_handler
+    set_global_handler("langfuse")
+    st.set_page_config(layout="wide")
+
+    main()
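
The restructuring above is the standard Python entry-point pattern: the whole Streamlit UI moves into main(), and the one-time global setup (the Langfuse callback handler and the page config) runs under the __name__ guard before main() is called. A minimal sketch of that pattern, with illustrative page content that is not taken from app.py:

import streamlit as st

def main():
    # All widgets and page logic live inside main()
    st.title('Example app')

if __name__ == '__main__':
    # Global setup runs before any UI is drawn; in particular,
    # st.set_page_config() must be the first Streamlit command in the script.
    st.set_page_config(layout="wide")
    main()

Note that Streamlit re-executes the whole script on every interaction, so the guard organizes the code rather than preventing reruns.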