Belemort committed on
Commit
a18d1e2
1 Parent(s): 556e607

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +233 -0
app.py CHANGED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from mistralai import Mistral
3
+ from langchain_community.tools import TavilySearchResults, JinaSearch
4
+ import concurrent.futures
5
+ import json
6
+ import os
7
+ import arxiv
8
+ import fitz # PyMuPDF
9
+ from docx import Document
10
+ from PIL import Image
11
+ import io
12
+ import base64
13
+ import mimetypes
14
+
15
# API credentials are read from the environment so that no secret is stored in
# the source file. Any key that was previously committed here should be
# considered leaked and rotated.
def _read_api_key(*names):
    """Return the first non-empty environment variable among *names*, else None."""
    for name in names:
        value = os.environ.get(name)
        if value:
            return value
    return None


# Tavily: the key is expected in TAVILY_API_KEY. It is never passed explicitly
# in this file, so it is only checked here, not copied anywhere.
if not _read_api_key("TAVILY_API_KEY"):
    print("Warning: TAVILY_API_KEY is not set; Tavily search will be unavailable.")

# Mistral clients: each client may use its own key (MISTRAL_API_KEY_1/2/3) and
# falls back to a shared MISTRAL_API_KEY when the specific one is not set.
if not _read_api_key(
    "MISTRAL_API_KEY", "MISTRAL_API_KEY_1", "MISTRAL_API_KEY_2", "MISTRAL_API_KEY_3"
):
    print("Warning: no Mistral API key found in the environment; model calls will fail.")

client_1 = Mistral(api_key=_read_api_key("MISTRAL_API_KEY_1", "MISTRAL_API_KEY"))
client_2 = Mistral(api_key=_read_api_key("MISTRAL_API_KEY_2", "MISTRAL_API_KEY"))
client_3 = Mistral(api_key=_read_api_key("MISTRAL_API_KEY_3", "MISTRAL_API_KEY"))
22
+
23
+ # Function to encode images in base64
24
def encode_image_bytes(image_bytes):
    """Return the base64 encoding of ``image_bytes`` as a text string."""
    encoded = base64.b64encode(image_bytes)
    return str(encoded, 'utf-8')
26
+
27
+ # Functions to process various file types
28
def process_file(file_path):
    """Dispatch ``file_path`` to the matching extractor based on its MIME type.

    The MIME type is guessed from the file name. PDF, DOCX and plain-text
    files are handed to their extractor; any other type is reported on
    stdout and yields ``(None, [])``.

    Returns:
        The ``(text, images)`` tuple produced by the chosen extractor, or
        ``(None, [])`` for an unsupported type.
    """
    detected = mimetypes.guess_type(file_path)[0]

    if detected == 'application/pdf':
        return process_pdf(file_path)
    if detected == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        return process_docx(file_path)
    if detected == 'text/plain':
        return process_txt(file_path)

    print(f"Unsupported file type: {detected}")
    return None, []
39
+
40
def process_pdf(file_path):
    """Extract the text and the embedded images of a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        A ``(text, images)`` tuple. ``text`` is the plain text of all pages
        concatenated in page order. ``images`` is a list of
        ``{"type": "image_url", "image_url": <data URI>}`` dicts, one per
        image reference found on each page.
    """
    page_texts = []
    images = []
    # The context manager closes the document (and its file handle) even when
    # extraction fails part-way through; previously it was never closed.
    with fitz.open(file_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page_texts.append(pdf_document[page_num].get_text("text"))
            for img in pdf_document.get_page_images(page_num, full=True):
                xref = img[0]  # first field of the image entry is its xref
                base_image = pdf_document.extract_image(xref)
                base64_image = encode_image_bytes(base_image["image"])
                image_data = f"data:image/{base_image['ext']};base64,{base64_image}"
                images.append({"type": "image_url", "image_url": image_data})
    return "".join(page_texts), images
55
+
56
def process_docx(file_path):
    """Extract the paragraph text and the embedded images of a DOCX file.

    Args:
        file_path: Path of the ``.docx`` document to read.

    Returns:
        A ``(text, images)`` tuple. ``text`` holds every paragraph followed by
        a newline. ``images`` is a list of
        ``{"type": "image_url", "image_url": <JPEG data URI>}`` dicts, one per
        embedded image that could be decoded.
    """
    doc = Document(file_path)
    text = "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
    images = []
    for rel in doc.part.rels.values():
        if "image" not in rel.target_ref:
            continue
        if rel.is_external:
            # Linked (not embedded) images have no part whose bytes can be read.
            continue
        try:
            img = Image.open(io.BytesIO(rel.target_part.blob))
            # JPEG cannot store alpha or palette modes (e.g. RGBA/P from PNGs),
            # so convert those before saving instead of raising.
            if img.mode not in ("L", "RGB", "CMYK"):
                img = img.convert("RGB")
            buffered = io.BytesIO()
            img.save(buffered, format="JPEG")
        except OSError as e:
            # Formats the imaging library cannot decode are skipped so one bad
            # image does not abort the whole document.
            print(f"Skipping unreadable image '{rel.target_ref}': {e}")
            continue
        image_base64 = encode_image_bytes(buffered.getvalue())
        images.append({"type": "image_url", "image_url": f"data:image/jpeg;base64,{image_base64}"})
    return text, images
71
+
72
def process_txt(file_path):
    """Read a UTF-8 text file.

    Returns:
        ``(text, [])`` -- the full file contents and an empty image list, so
        the result has the same shape as the other extractors.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    no_images = []
    return contents, no_images
76
+
77
+ # Search setup function
78
def setup_search(question):
    """Run a web search for ``question``, trying Tavily first and Jina second.

    Each provider is tried in turn; a provider that raises is reported on
    stdout and skipped, and a provider whose result is not a list is skipped.

    Returns:
        ``(results, tool_name)`` where ``tool_name`` is ``'tavily_tool'`` or
        ``'jina_tool'``, or ``([], '')`` when neither provider yields a list.
    """
    try:
        tavily_hits = TavilySearchResults(max_results=20).invoke({"query": f"{question}"})
    except Exception as e:
        print("Error with TavilySearchResults:", e)
    else:
        if isinstance(tavily_hits, list):
            return tavily_hits, 'tavily_tool'

    try:
        raw_jina = JinaSearch().invoke({"query": f"{question}"})
        # The Jina output is parsed as JSON from its string form.
        jina_hits = json.loads(str(raw_jina))
    except Exception as e:
        print("Error with JinaSearch:", e)
    else:
        if isinstance(jina_hits, list):
            return jina_hits, 'jina_tool'

    return [], ''
94
+
95
+ # Function to extract key topics
96
def extract_key_topics(content, images=None):
    """Ask the vision-language model for the main themes of ``content``.

    Args:
        content: Text to analyse; it is embedded verbatim in the prompt.
        images: Optional list of image content entries (``{"type":
            "image_url", ...}`` dicts) sent along with the prompt. ``None``
            means no images.

    Returns:
        The model's reply text, prompted to be a dash-prefixed list of themes
        in English.
    """
    # ``None`` replaces the former mutable ``[]`` default; a fresh list is
    # built on every call so callers' lists are never shared or modified.
    images = list(images) if images else []
    prompt = f"""
Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.

```{content}```

LIST IN ENGLISH:
-
"""
    message_content = [{"type": "text", "text": prompt}] + images
    response = client_1.chat.complete(
        model="pixtral-12b-2409",
        messages=[{"role": "user", "content": message_content}]
    )
    return response.choices[0].message.content
111
+
112
def search_relevant_articles_arxiv(key_topics, max_articles=100):
    """Fetch arXiv articles for each topic, querying the topics in parallel.

    Args:
        key_topics: Iterable of topic strings used as arXiv search queries.
            Blank entries and duplicates are ignored.
        max_articles: Maximum number of results requested per topic.

    Returns:
        ``(articles_by_topic, final_topics)``. ``articles_by_topic`` maps each
        topic that produced at least one article to a list of dicts with the
        keys ``title``, ``doi``, ``summary``, ``url`` and ``pdf_url``.
        ``final_topics`` lists those same topics. Both follow the order in
        which the topics first appear in ``key_topics``.
    """

    def fetch_articles_for_topic(topic):
        """Return ``(topic, articles)``; failures are logged and keep what was fetched."""
        topic_articles = []
        try:
            search = arxiv.Search(
                query=topic,
                max_results=max_articles,
                sort_by=arxiv.SortCriterion.Relevance
            )
            # NOTE(review): newer releases of the arxiv package may prefer
            # Client().results(search) -- confirm against the installed version.
            for result in search.results():
                topic_articles.append({
                    "title": result.title,
                    "doi": result.doi,
                    "summary": result.summary,
                    "url": result.entry_id,
                    "pdf_url": result.pdf_url
                })
        except Exception as e:
            print(f"Error fetching articles for topic '{topic}': {e}")
        return topic, topic_articles

    # Drop blanks and duplicates (keeping first-seen order) so no query is
    # sent twice and no empty query is sent at all.
    unique_topics = [t for t in dict.fromkeys(key_topics) if t and str(t).strip()]

    found = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_articles_for_topic, t) for t in unique_topics]
        for future in concurrent.futures.as_completed(futures):
            topic, articles = future.result()
            if articles:
                found[topic] = articles

    # Derive both outputs from the collected results in input order instead of
    # appending to a shared list from worker threads and de-duplicating via
    # set(), which produced an arbitrary order.
    final_topics = [t for t in unique_topics if t in found]
    articles_by_topic = {t: found[t] for t in final_topics}
    return articles_by_topic, final_topics
149
+
150
+ # Initialize process for text analysis
151
def init(content, images=None):
    """Extract key topics from ``content`` and look up related arXiv articles.

    Args:
        content: Text to analyse.
        images: Optional list of image content entries passed to the topic
            extraction step. ``None`` means no images.

    Returns:
        ``(final_topics, result_json)``: the topics for which articles were
        found, and the articles grouped by topic serialized as indented JSON.
    """
    # ``None`` replaces the former mutable ``[]`` default.
    images = [] if images is None else images
    raw_topics = extract_key_topics(content, images) or ""
    # Strip the list markers first and filter afterwards, so lines that hold
    # only a dash or whitespace do not become empty search topics.
    stripped = (line.strip("- \t\r") for line in raw_topics.splitlines())
    key_topics = [topic for topic in stripped if topic]
    articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
    result_json = json.dumps(articles_by_topic, indent=4)
    return final_topics, result_json
157
+
158
+ # Summarization function
159
def process_article_for_summary(text, images=None, compression_percentage=30):
    """Ask the model for a markdown summary of ``text``.

    Args:
        text: Article text to summarize; embedded verbatim in the prompt.
        images: Optional list of image content entries sent with the prompt.
            ``None`` means no images.
        compression_percentage: Percentage by which the prompt asks the model
            to cut the article.

    Returns:
        The model's reply text.
    """
    # ``None`` replaces the former mutable ``[]`` default; a fresh list is
    # built on every call so callers' lists are never shared or modified.
    images = list(images) if images else []
    prompt = f"""
You are a commentator.
# article:
{text}

# Instructions:
## Summarize:
In clear and concise language, summarize the key points and themes presented in the article by cutting it by {compression_percentage} percent in the markdown format.

"""
    message_content = [{"type": "text", "text": prompt}] + images
    response = client_3.chat.complete(
        model="pixtral-12b-2409",
        messages=[{"role": "user", "content": message_content}]
    )
    return response.choices[0].message.content
176
+
177
+ # Question answering function
178
def ask_question_to_mistral(text, question, images=None):
    """Answer ``question`` about ``text``, enriched with web-search context.

    Args:
        text: Source text the question refers to.
        question: The user's question; also used as the web-search query.
        images: Optional list of image content entries sent with the prompt.
            ``None`` means no images.

    Returns:
        The model's reply text.
    """
    # ``None`` replaces the former mutable ``[]`` default.
    images = list(images) if images else []

    results, tool = setup_search(question)
    context_lines = []
    if results:
        if tool == 'tavily_tool':
            for result in results:
                if isinstance(result, dict):  # skip entries without dict fields
                    context_lines.append(
                        f"{result.get('url', 'N/A')} : {result.get('content', 'No content')} \n"
                    )
        elif tool == 'jina_tool':
            for result in results:
                if isinstance(result, dict):
                    context_lines.append(
                        f"{result.get('link', 'N/A')} : {result.get('snippet', 'No snippet')} : {result.get('content', 'No content')} \n"
                    )
    context = "".join(context_lines)

    prompt = f"Answer the following question without mentioning it or repeating the original text on which the question is asked in style markdown.IN RUSSIAN:\nQuestion: {question}\n\nText:\n{text}"
    if context:
        prompt += f"\n\nAdditional Context from Web Search:\n{context}"

    # Send the structured content list itself. Previously the list was
    # interpolated into an f-string, so the model received a Python repr of
    # the dicts (including raw base64 image data) as plain text.
    message_content = [{"type": "text", "text": prompt}] + images
    response = client_2.chat.complete(
        model="pixtral-12b-2409",
        messages=[{"role": "user", "content": message_content}]
    )
    return response.choices[0].message.content
195
+
196
+ # Gradio interface
197
def gradio_interface(file, task, question, compression_percentage):
    """Handle a submit click: analyse the uploaded file and run the chosen task.

    Args:
        file: Uploaded file from the UI, or a falsy value when nothing was
            uploaded (the analysis then runs on empty text).
        task: ``"Summarization"`` or ``"Question Answering"``.
        question: Question text, used only for Question Answering.
        compression_percentage: Target reduction passed to the summarizer.

    Returns:
        A dict for the JSON output component. On success it holds ``Topics``,
        ``Articles`` and either ``Summary`` or ``Answer``; on invalid input it
        holds a single ``Error`` message.
    """
    # Validate the task before any model or search call is made; previously an
    # unselected task ran the whole topic analysis and then returned None.
    if task not in ("Summarization", "Question Answering"):
        return {"Error": "Please select a task: Summarization or Question Answering."}

    if file:
        # NOTE(review): the upload is presumably either an object exposing
        # ``.name`` or a plain path string, depending on the Gradio version --
        # confirm against the installed release.
        file_path = getattr(file, "name", file)
        text, images = process_file(file_path)
        if text is None:
            # process_file signals an unsupported type with (None, []).
            return {"Error": "Unsupported file type. Please upload a PDF, DOCX, or TXT file."}
    else:
        text, images = "", []

    topics, articles_json = init(text, images)

    if task == "Summarization":
        summary = process_article_for_summary(text, images, compression_percentage)
        return {"Topics": topics, "Summary": summary, "Articles": articles_json}

    # task == "Question Answering"
    if question:
        answer = ask_question_to_mistral(text, question, images)
        return {"Topics": topics, "Answer": answer, "Articles": articles_json}
    return {"Topics": topics, "Answer": "No question provided.", "Articles": articles_json}
214
+
215
def _toggle_task_inputs(choice):
    """Show the question box for Question Answering and the slider for Summarization."""
    show_question = choice == "Question Answering"
    show_slider = choice == "Summarization"
    return gr.update(visible=show_question), gr.update(visible=show_slider)


# Build the UI: one row of inputs, a JSON results panel, and a submit button.
with gr.Blocks() as demo:
    gr.Markdown("## Text Analysis: Summarization or Question Answering")
    with gr.Row():
        file_input = gr.File(label="Upload File")
        task_choice = gr.Radio(["Summarization", "Question Answering"], label="Select Task")
        # Both task-specific inputs start hidden and are revealed by the
        # task selector below.
        question_input = gr.Textbox(label="Question (for Question Answering)", visible=False)
        compression_input = gr.Slider(
            label="Compression Percentage (for Summarization)",
            minimum=10,
            maximum=90,
            value=30,
            visible=False,
        )

    task_choice.change(
        _toggle_task_inputs,
        inputs=task_choice,
        outputs=[question_input, compression_input],
    )

    with gr.Row():
        result_output = gr.JSON(label="Results")

    submit_button = gr.Button("Submit")
    submit_inputs = [file_input, task_choice, question_input, compression_input]
    submit_button.click(gradio_interface, submit_inputs, result_output)

demo.launch()