Belemort committed on
Commit
f23ad63
1 Parent(s): e139162

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -250
app.py CHANGED
@@ -5,21 +5,12 @@ import concurrent.futures
5
  import json
6
  import os
7
  import arxiv
 
8
  from PIL import Image
9
  import io
10
  import base64
11
- from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
12
- from langchain.text_splitter import CharacterTextSplitter
13
- from langchain_mistralai import ChatMistralAI
14
- from langchain.chains.combine_documents.stuff import StuffDocumentsChain
15
- from langchain.chains.llm import LLMChain
16
- from langchain_core.prompts import PromptTemplate
17
- from transformers import AutoTokenizer
18
- tokenizer = AutoTokenizer.from_pretrained("mistral-community/pixtral-12b")
19
 
20
- def count_tokens_in_text(text):
21
- tokens = tokenizer(text, return_tensors="pt", truncation=False, add_special_tokens=True)
22
- return len(tokens["input_ids"][0])
23
 
24
 
25
  # Set environment variables for Tavily API
@@ -29,8 +20,6 @@ os.environ["TAVILY_API_KEY"] = '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'
29
  client_1 = Mistral(api_key='eLES5HrVqduOE1OSWG6C5XyEUeR7qpXQ')
30
  client_2 = Mistral(api_key='VPqG8sCy3JX5zFkpdiZ7bRSnTLKwngFJ')
31
  client_3 = Mistral(api_key='cvyu5Rdk2lS026epqL4VB6BMPUcUMSgt')
32
- api_key_4 = 'lCZWDjyQSEc5gJsATEcKjP9cCjWsB7lg'
33
- client_4 = ChatMistralAI(api_key=api_key_4, model="pixtral-12b-2409")
34
 
35
  # Function to encode images in base64
36
  def encode_image_bytes(image_bytes):
@@ -90,73 +79,6 @@ def extract_key_topics(content, images=[]):
90
  )
91
  return response.choices[0].message.content
92
 
93
- def extract_key_topics_with_large_text(content, images=[]):
94
- # Map prompt template for extracting key themes
95
- map_template = f"""
96
- Текст: {{docs}}
97
- Изображения: {{images}}
98
-
99
- Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
100
- LIST IN ENGLISH:
101
- -
102
-
103
- :"""
104
-
105
- map_prompt = PromptTemplate.from_template(map_template)
106
- map_chain = LLMChain(llm=client_4, prompt=map_prompt)
107
-
108
- # Reduce prompt template to further refine and extract key themes
109
- reduce_template = f"""Следующий текст состоит из нескольких кратких итогов:
110
- {{docs}}
111
-
112
- Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
113
- LIST IN ENGLISH:
114
- -
115
-
116
- :"""
117
-
118
- reduce_prompt = PromptTemplate.from_template(reduce_template)
119
- reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)
120
-
121
- # Combine documents chain for Reduce step
122
- combine_documents_chain = StuffDocumentsChain(
123
- llm_chain=reduce_chain, document_variable_name="docs"
124
- )
125
-
126
- # ReduceDocumentsChain configuration
127
- reduce_documents_chain = ReduceDocumentsChain(
128
- combine_documents_chain=combine_documents_chain,
129
- collapse_documents_chain=combine_documents_chain,
130
- token_max=128000,
131
- )
132
-
133
- # MapReduceDocumentsChain combining Map and Reduce
134
- map_reduce_chain = MapReduceDocumentsChain(
135
- llm_chain=map_chain,
136
- reduce_documents_chain=reduce_documents_chain,
137
- document_variable_name="docs",
138
- return_intermediate_steps=False,
139
- )
140
-
141
- # Text splitter configuration
142
- text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
143
- tokenizer,
144
- chunk_size=100000,
145
- chunk_overlap=14000,
146
- )
147
-
148
- # Split the text into documents
149
- split_docs = text_splitter.create_documents([content])
150
-
151
- # Include image descriptions (optional, if required by the prompt)
152
- image_descriptions = "\n".join(
153
- [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
154
- )
155
-
156
- # Run the summarization chain to extract key themes
157
- key_topics = map_reduce_chain.run({"input_documents": split_docs, "images": image_descriptions})
158
- return key_topics
159
-
160
  def search_relevant_articles_arxiv(key_topics, max_articles=100):
161
  articles_by_topic = {}
162
  final_topics = []
@@ -195,20 +117,13 @@ def search_relevant_articles_arxiv(key_topics, max_articles=100):
195
 
196
  return articles_by_topic, list(set(final_topics))
197
 
198
-
199
  def init(content, images=[]):
200
- if count_tokens_in_text(text=content) < 128_000:
201
- key_topics = extract_key_topics(content, images)
202
- key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
203
- articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
204
- result_json = json.dumps(articles_by_topic, indent=4)
205
- return final_topics, result_json
206
- else:
207
- key_topics = extract_key_topics_with_large_text(content, images)
208
- key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
209
- articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
210
- result_json = json.dumps(articles_by_topic, indent=4)
211
- return final_topics, result_json
212
 
213
  # Summarization function
214
  def process_article_for_summary(text, images=[], compression_percentage=30):
@@ -231,76 +146,6 @@ def process_article_for_summary(text, images=[], compression_percentage=30):
231
  )
232
  return response.choices[0].message.content
233
 
234
- def process_large_article_for_summary(text, images=[], compression_percentage=30):
235
- # Map prompt template
236
- map_template = f"""Следующий текст состоит из текста и изображений:
237
- Текст: {{docs}}
238
- Изображения: {{images}}
239
-
240
- На основе приведенного материала, выполните сжатие текста, выделяя основные темы и важные моменты.
241
- Уровень сжатия: {compression_percentage}%.
242
- Ответ предоставьте на русском языке в формате Markdown.
243
-
244
- Полезный ответ:"""
245
-
246
- map_prompt = PromptTemplate.from_template(map_template)
247
- map_chain = LLMChain(llm=client_4, prompt=map_prompt)
248
-
249
- # Reduce prompt template
250
- reduce_template = f"""Следующий текст состоит из нескольких кратких итогов:
251
- {{docs}}
252
-
253
- На основе этих кратких итогов, выполните финальное сжатие текста, объединяя основные темы и ключевые моменты.
254
- Уровень сжатия: {compression_percentage}%.
255
- Результат предоставьте на русском языке в формате Markdown.
256
-
257
- Полезный ответ:"""
258
-
259
- reduce_prompt = PromptTemplate.from_template(reduce_template)
260
- reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)
261
-
262
- # Combine documents chain for Reduce step
263
- combine_documents_chain = StuffDocumentsChain(
264
- llm_chain=reduce_chain, document_variable_name="docs"
265
- )
266
-
267
- # ReduceDocumentsChain configuration
268
- reduce_documents_chain = ReduceDocumentsChain(
269
- combine_documents_chain=combine_documents_chain,
270
- collapse_documents_chain=combine_documents_chain,
271
- token_max=128000,
272
- )
273
-
274
- # MapReduceDocumentsChain combining Map and Reduce
275
- map_reduce_chain = MapReduceDocumentsChain(
276
- llm_chain=map_chain,
277
- reduce_documents_chain=reduce_documents_chain,
278
- document_variable_name="docs",
279
- return_intermediate_steps=False,
280
- )
281
-
282
- # Text splitter configuration
283
- text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
284
- tokenizer,
285
- chunk_size=100000,
286
- chunk_overlap=14000,
287
- )
288
-
289
- # Split the text into documents
290
- split_docs = text_splitter.create_documents([text])
291
- # Include image descriptions
292
- image_descriptions = "\n".join(
293
- [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
294
- )
295
-
296
- # Run the summarization chain
297
-
298
- with concurrent.futures.ThreadPoolExecutor() as executor:
299
- extract_future = executor.submit(init, text, images)
300
- summary = map_reduce_chain.run({"input_documents": split_docs, "images": image_descriptions})
301
- key_topics , result_article_json = extract_future.result()
302
- return summary, key_topics, result_article_json
303
-
304
  # Question answering function
305
  def ask_question_to_mistral(text, question, images=[]):
306
  prompt = f"Answer the following question without mentioning it or repeating the original text on which the question is asked in style markdown.IN RUSSIAN:\nQuestion: {question}\n\nText:\n{text}"
@@ -324,100 +169,19 @@ def ask_question_to_mistral(text, question, images=[]):
324
  )
325
  return response.choices[0].message.content
326
 
327
- def ask_question_to_mistral_with_large_text(text, question, images=[]):
328
- # Prompts for QA
329
- map_template = """Следующий текст содержит статью/произведение:
330
- Текст: {{docs}}
331
- Изображения: {{images}}
332
- На основе приведенного текста, ответьте на следующий вопрос:
333
-
334
- Вопрос: {question}
335
-
336
- Ответ должен быть точным. Пожалуйста, ответьте на русском языке в формате Markdown.
337
-
338
- Полезный ответ:"""
339
-
340
- reduce_template = """Следующий текст содержит несколько кратких ответов на вопрос:
341
- {{docs}}
342
-
343
- Объедините их в финальный ответ. Ответ предоставьте на русском языке в формате Markdown.
344
-
345
- Полезный ответ:"""
346
-
347
- map_prompt = PromptTemplate.from_template(map_template)
348
- map_chain = LLMChain(llm=client_4, prompt=map_prompt)
349
-
350
- reduce_prompt = PromptTemplate.from_template(reduce_template)
351
- reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)
352
-
353
- # Combine documents chain for Reduce step
354
- combine_documents_chain = StuffDocumentsChain(
355
- llm_chain=reduce_chain, document_variable_name="docs"
356
- )
357
-
358
- # ReduceDocumentsChain configuration
359
- reduce_documents_chain = ReduceDocumentsChain(
360
- combine_documents_chain=combine_documents_chain,
361
- collapse_documents_chain=combine_documents_chain,
362
- token_max=128000,
363
- )
364
-
365
- # MapReduceDocumentsChain combining Map and Reduce
366
- map_reduce_chain = MapReduceDocumentsChain(
367
- llm_chain=map_chain,
368
- reduce_documents_chain=reduce_documents_chain,
369
- document_variable_name="docs",
370
- return_intermediate_steps=False,
371
- )
372
-
373
- # Text splitter configuration
374
- text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
375
- tokenizer,
376
- chunk_size=100000,
377
- chunk_overlap=14000,
378
- )
379
-
380
- # Split the text into documents
381
- split_docs = text_splitter.create_documents([text])
382
-
383
- # Include image descriptions
384
- image_descriptions = "\n".join(
385
- [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
386
- )
387
-
388
- with concurrent.futures.ThreadPoolExecutor() as executor:
389
- extract_future = executor.submit(init, text, images)
390
- summary = map_reduce_chain.run({"input_documents": split_docs, "question": question , "images": image_descriptions})
391
- key_topics , result_article_json = extract_future.result()
392
- return summary, key_topics, result_article_json
393
-
394
-
395
  # Gradio interface
396
  def gradio_interface(text_input, images_base64, task, question, compression_percentage):
397
  text, images = process_input(text_input, images_base64)
398
 
399
- if task == "Summarization":
400
 
401
- if count_tokens_in_text(text=text) < 128_000:
402
- topics, articles_json = init(text, images)
403
- summary = process_article_for_summary(text, images, compression_percentage)
404
- return {"Topics": topics, "Summary": summary, "Articles": articles_json}
405
-
406
- else:
407
- summary , key_topics, result_article_json = process_large_article_for_summary(text, images, compression_percentage)
408
- return {"Topics": key_topics, "Summary": summary, "Articles": result_article_json}
409
-
410
  elif task == "Question Answering":
411
-
412
  if question:
413
-
414
- if count_tokens_in_text(text=text) < 128_000:
415
- topics, articles_json = init(text, images)
416
- answer = ask_question_to_mistral(text, question, images)
417
- return {"Topics": topics, "Answer": answer, "Articles": articles_json}
418
- else:
419
- summary , key_topics, result_article_json = ask_question_to_mistral_with_large_text(text, question, images)
420
- return {"Topics": key_topics, "Answer": answer, "Articles": result_article_json}
421
  else:
422
  return {"Topics": topics, "Answer": "No question provided.", "Articles": articles_json}
423
 
 
5
  import json
6
  import os
7
  import arxiv
8
+ from docx import Document
9
  from PIL import Image
10
  import io
11
  import base64
 
 
 
 
 
 
 
 
12
 
13
+
 
 
14
 
15
 
16
  # Set environment variables for Tavily API
 
20
  client_1 = Mistral(api_key='eLES5HrVqduOE1OSWG6C5XyEUeR7qpXQ')
21
  client_2 = Mistral(api_key='VPqG8sCy3JX5zFkpdiZ7bRSnTLKwngFJ')
22
  client_3 = Mistral(api_key='cvyu5Rdk2lS026epqL4VB6BMPUcUMSgt')
 
 
23
 
24
  # Function to encode images in base64
25
  def encode_image_bytes(image_bytes):
 
79
  )
80
  return response.choices[0].message.content
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def search_relevant_articles_arxiv(key_topics, max_articles=100):
83
  articles_by_topic = {}
84
  final_topics = []
 
117
 
118
  return articles_by_topic, list(set(final_topics))
119
 
120
# Initialize process for text analysis
def init(content, images=None):
    """Extract key topics from *content* and find related arXiv articles.

    Parameters
    ----------
    content : str
        The article text to analyse.
    images : list, optional
        Image payloads forwarded to the topic extractor (presumably
        base64-encoded image-URL dicts — confirm against caller).

    Returns
    -------
    tuple
        ``(final_topics, result_json)`` — the de-duplicated topic list and
        a JSON string mapping each topic to the arXiv articles found for it.
    """
    # Guard against the shared mutable default-argument pitfall.
    images = [] if images is None else images
    key_topics = extract_key_topics(content, images)
    # The model returns a newline-separated bullet list; strip "- " markers
    # and drop empty lines.
    key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
    articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
    result_json = json.dumps(articles_by_topic, indent=4)
    return final_topics, result_json
 
 
 
 
 
 
 
127
 
128
  # Summarization function
129
  def process_article_for_summary(text, images=[], compression_percentage=30):
 
146
  )
147
  return response.choices[0].message.content
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  # Question answering function
150
  def ask_question_to_mistral(text, question, images=[]):
151
  prompt = f"Answer the following question without mentioning it or repeating the original text on which the question is asked in style markdown.IN RUSSIAN:\nQuestion: {question}\n\nText:\n{text}"
 
169
  )
170
  return response.choices[0].message.content
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
# Gradio interface
def gradio_interface(text_input, images_base64, task, question, compression_percentage):
    """Dispatch a UI request to summarization or question answering.

    Parameters
    ----------
    text_input : str
        Raw article text (or document payload) from the UI.
    images_base64 :
        Accompanying images; decoded by ``process_input``.
    task : str
        Either ``"Summarization"`` or ``"Question Answering"``.
    question : str
        The user's question; only consulted for question answering.
    compression_percentage :
        Target compression level passed to the summarizer.

    Returns
    -------
    dict
        Always contains ``"Topics"`` and ``"Articles"``, plus either
        ``"Summary"`` or ``"Answer"`` depending on the task.
    """
    text, images = process_input(text_input, images_base64)

    # Topic extraction / arXiv search is needed by every branch below,
    # so run it once up front.
    topics, articles_json = init(text, images)

    if task == "Summarization":
        summary = process_article_for_summary(text, images, compression_percentage)
        return {"Topics": topics, "Summary": summary, "Articles": articles_json}
    elif task == "Question Answering":
        if question:
            answer = ask_question_to_mistral(text, question, images)
            return {"Topics": topics, "Answer": answer, "Articles": articles_json}
        else:
            return {"Topics": topics, "Answer": "No question provided.", "Articles": articles_json}

    # Defensive fallback: the UI offers only the two tasks above, but an
    # unrecognized value must not silently return None.
    return {"Topics": topics, "Answer": f"Unknown task: {task}", "Articles": articles_json}
187