Belemort committed on
Commit
b6529a0
1 Parent(s): e22cf9b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +627 -0
app.py ADDED
@@ -0,0 +1,627 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from mistralai import Mistral
3
+ from langchain_community.tools import TavilySearchResults, JinaSearch
4
+ import concurrent.futures
5
+ import json
6
+ import os
7
+ import arxiv
8
+ from PIL import Image
9
+ import io
10
+ import base64
11
+ from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
12
+ from langchain.text_splitter import CharacterTextSplitter
13
+ from langchain_mistralai import ChatMistralAI
14
+ from langchain.chains.combine_documents.stuff import StuffDocumentsChain
15
+ from langchain.chains.llm import LLMChain
16
+ from langchain_core.prompts import PromptTemplate
17
+ from json_repair import repair_json
18
+ from transformers import AutoTokenizer
19
# Tokenizer shared by every token-count check and text splitter in this file.
tokenizer = AutoTokenizer.from_pretrained("mistral-community/pixtral-12b")


def count_tokens_in_text(text):
    """Return how many tokens the module-level ``tokenizer`` yields for ``text``.

    Special tokens are counted and the input is never truncated, so the result
    can be compared directly with a context-window limit.
    """
    # Plain Python lists are enough for counting; requesting "pt" tensors would
    # require PyTorch to be installed for no benefit.
    encoded = tokenizer(text, truncation=False, add_special_tokens=True)
    return len(encoded["input_ids"])
24
+
25
# API credentials.
#
# NOTE(review): the literal fallbacks below are real credentials that were
# committed to the repository. They are kept only so existing deployments keep
# working; rotate them and provide the values through environment variables
# (e.g. Space secrets), then delete the literals.

# Tavily reads its key from the environment; an already configured value wins.
os.environ.setdefault("TAVILY_API_KEY", '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')

# Mistral clients: one per task so that requests are spread over several keys.
client_1 = Mistral(api_key=os.environ.get("MISTRAL_API_KEY_1", 'eLES5HrVqduOE1OSWG6C5XyEUeR7qpXQ'))
client_2 = Mistral(api_key=os.environ.get("MISTRAL_API_KEY_2", 'VPqG8sCy3JX5zFkpdiZ7bRSnTLKwngFJ'))
client_3 = Mistral(api_key=os.environ.get("MISTRAL_API_KEY_3", 'cvyu5Rdk2lS026epqL4VB6BMPUcUMSgt'))
api_key_4 = os.environ.get("MISTRAL_API_KEY_4", 'aYls8aj48SOEov8AY1dwp4hr07MsCRFb')
# LangChain chat model used by all map-reduce chains.
client_4 = ChatMistralAI(api_key=api_key_4, model="pixtral-12b-2409")
34
+
35
+ # Function to encode images in base64
36
def encode_image_bytes(image_bytes):
    """Return the Base64 encoding of ``image_bytes`` as a text string."""
    return str(base64.b64encode(image_bytes), 'utf-8')
38
+
39
+ # Function to decode base64 images
40
def decode_base64_image(base64_str):
    """Decode a Base64 string into a PIL image.

    Accepts either a bare Base64 payload or a full data URI such as
    ``data:image/jpeg;base64,<payload>``.

    Raises whatever ``base64.b64decode`` or ``Image.open`` raise for input that
    is not a decodable image.
    """
    # Only the part after the first comma of a data URI is Base64. If the
    # header were left in place, its alphabet characters would be decoded as
    # garbage bytes in front of the real image data.
    if base64_str.lstrip().startswith("data:"):
        base64_str = base64_str.split(",", 1)[-1]
    image_data = base64.b64decode(base64_str)
    return Image.open(io.BytesIO(image_data))
43
+
44
+ # Process text and images provided by the user
45
# Modes that can be written as JPEG without conversion.
_JPEG_MODES = ("L", "RGB", "CMYK")


def _iter_base64_payloads(images_base64):
    """Yield individual Base64 payloads from a string or an iterable of strings."""
    entries = [images_base64] if isinstance(images_base64, str) else images_base64
    for entry in entries:
        if not isinstance(entry, str):
            # Unknown shape: pass it on untouched so the decoder reports it.
            yield entry
            continue
        # "," is not part of the Base64 alphabet, so it can only be a separator
        # between images or the end of a "data:...;base64" header.
        for piece in entry.split(","):
            piece = piece.strip()
            # ":" is not Base64 either, so a piece starting with "data:" is a
            # data-URI header, never a payload.
            if piece and not piece.startswith("data:"):
                yield piece


def process_input(text_input, images_base64):
    """Normalise user input into text plus a list of image message parts.

    Args:
        text_input: The text entered by the user; returned unchanged.
        images_base64: Base64-encoded images, either as one comma-separated
            string (as delivered by the UI textbox) or as an iterable of
            strings. Data-URI headers are tolerated. May be empty or ``None``.

    Returns:
        ``(text_input, images)`` where every image is a dict of the form
        ``{"type": "image_url", "image_url": "data:image/jpeg;base64,..."}``.
        Images that cannot be decoded are reported on stdout and skipped.
    """
    images = []
    if images_base64:
        for payload in _iter_base64_payloads(images_base64):
            try:
                img = decode_base64_image(payload)
                # JPEG has no alpha/palette support; convert instead of
                # failing and silently losing the image.
                if img.mode not in _JPEG_MODES:
                    img = img.convert("RGB")
                buffered = io.BytesIO()
                img.save(buffered, format="JPEG")
                image_base64 = encode_image_bytes(buffered.getvalue())
                images.append({"type": "image_url", "image_url": f"data:image/jpeg;base64,{image_base64}"})
            except Exception as e:
                # Best effort: one broken image must not abort the request.
                print(f"Error decoding image: {e}")

    return text_input, images
59
+
60
+ # Search setup function
61
def setup_search(question):
    """Search the web for ``question``, trying Tavily first and Jina second.

    Returns:
        ``(results, tool_name)`` where ``tool_name`` is ``'tavily_tool'`` or
        ``'jina_tool'`` for the backend that produced a list of results, or
        ``([], '')`` when neither backend did.
    """
    # NOTE(review): the two attempts are treated as sequential (Jina is also
    # tried when Tavily answers with a non-list); confirm against the original
    # indentation.
    try:
        tavily_hits = TavilySearchResults(max_results=20).invoke({"query": f"{question}"})
        if isinstance(tavily_hits, list):
            return tavily_hits, 'tavily_tool'
    except Exception as e:
        print("Error with TavilySearchResults:", e)

    try:
        jina_raw = JinaSearch().invoke({"query": f"{question}"})
        # Jina's answer is parsed from its string form as JSON.
        jina_hits = json.loads(str(jina_raw))
        if isinstance(jina_hits, list):
            return jina_hits, 'jina_tool'
    except Exception as e:
        print("Error with JinaSearch:", e)

    return [], ''
77
+
78
+
79
+
80
def _strip_code_fence(raw):
    """Return ``raw`` without a surrounding Markdown code fence, if present."""
    cleaned = raw.strip()
    if not cleaned.startswith("```"):
        return cleaned
    # Drop the opening fence line ("```" or "```json"); if the fence and the
    # payload share one line, just drop the three backticks.
    opening, newline, rest = cleaned.partition("\n")
    cleaned = (rest if newline else opening[3:]).rstrip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def lit_obr(text, crit):
    """Build a comparative literature review of ``text`` for the criteria ``crit``.

    Texts below 128000 tokens are sent to the model in a single request; longer
    ones go through a LangChain map-reduce chain.

    Args:
        text: The article texts to compare.
        crit: The user's comparison criteria, inserted into the prompt.

    Returns:
        The model's answer parsed from JSON (after ``repair_json``).

    Raises:
        json.JSONDecodeError: If the repaired answer still is not valid JSON.
    """
    # NOTE(review): the literal fallback is a committed credential; rotate it
    # and supply the key through the environment instead.
    api_key = os.environ.get("MISTRAL_API_KEY_LIT_OBR", 'vjOgcQPigpidK7njWV5jPidP69CHg5Yg')
    model = "pixtral-12b-2409"
    client = Mistral(api_key=api_key)
    chat_llm = ChatMistralAI(api_key=api_key, model=model)

    # Output-format instructions shared by both processing paths.
    prom = """
#####
# Выведи итог строго в формате JSON. Убедись, что:
# - JSON является валидным и имеет правильную вложенность.
# - Все строки (ключи и значения) заключены в двойные кавычки.
# - Нет лишних запятых.
# - Используй формат структуры, приведённой ниже.
#####
{"comparison_table": {"markdown": "| article title | criterion name 1 | criterion name 2 | criterion name 3 |\n|---------------|------------------|------------------|------------------|\n| article title 1 | result | result | result |\n| article title 2 | result | result | result |\n| article title 3 | result | result | result |"},
"quotes": {
"criterion name 1": {
"article title 1": "citation",
"article title 2": "citation",
"article title 3": "citation"
},
"criterion name 2": {
"article title 1": "citation",
"article title 2": "citation",
"article title 3": "citation"
},
"criterion name 3": {
"article title 1": "citation",
"article title 2": "citation",
"article title 3": "citation"
}
},
"conclusion": "result"
}
#####
# Убедись, что:
# - Поле "comparison_table.markdown" содержит корректно отформатированную таблицу с заголовками и данными.
# - Поля "quotes" содержат цитаты по указанным критериям для каждой статьи.
# - Поле "conclusion" включает краткое заключение о сравнении статей.
#
# Если есть неуверенность, уточни формат или структуру перед генерацией.
#####
"""

    def process_scientific_articles_for_analysis_1(text, criter_prompts=""):
        """Single-request path: send the whole text in one chat completion."""
        promt = f"""
Analyze scientific articles based on the criteria provided by the user. Extract relevant data from the text and present a concise comparative review.
Provide a brief literature review in the following format as a table, including article titles (not their indices) in the comparison row.

Represent the comparison in the form of a table, where:

- The first vertical column contains the titles of the articles in a shortened form without losing their meaning, strictly as text, and without indices.
- Subsequent columns contain concise information for each criterion, formulated based on the text of the article. The information should be brief but capture the essence without directly copying the text.

Additionally, below the table, provide full quotes from the text that confirm the data presented in the table:
- Each quote should be presented without any changes or interpretation.
- Quotes must be in the original language of the article.
- Group quotes by articles: start with the article title, followed by the quotes for each criterion.

Ensure the output is clear and useful.

Result requirements:
- The table should only contain concise information extracted from the text in the cells.
- Full quotes must be provided separately, below the table.
- Do not include author names or publication dates in the quotes.
- Both the concise data and the quotes should be presented in the language in which the articles are written.

Start numbering the articles from the first, excluding zero.

Input data:
Articles:
{text}

Criteria:
{criter_prompts}

Result format:
{prom}
"""

        chat_response = client.chat.complete(
            model=model,
            messages=[{"role": "user", "content": [{"type": "text", "text": promt}]}],
        )

        return chat_response.choices[0].message.content

    def process_scientific_articles_for_analysis_2(text, images=None, criter_prompts=""):
        """Map-reduce path for texts that do not fit into one request."""
        images = [] if images is None else images

        # The placeholders in both templates are filled in by PromptTemplate.
        map_template = """
{docs}
Analyze the scientific articles based on the criteria provided by the user. Extract the relevant data from the text and present a concise comparative review.
Provide a summary literature review in the following format as a table, including the article titles (not their indices) in the comparison row.

Present the comparisons in the form of a table where:

The first vertical column lists the titles of the articles, shortened without losing their meaning, and in no other format.
Subsequent columns represent the parameters provided below.
Rows contain concise quotes extracted from the text.
Additionally, below the table, provide direct quotes from the text without any summarization or changes that confirm the data presented in the table. These quotes must consist only of sentences from the text, excluding publication dates and author names. If no data is available, state "No data available." Present each quote on a separate line under the corresponding criterion in the table, group the quotes by article, and include the article titles (not indices). Write the quotes in the language in which they appear in the text.

Start numbering the articles from the first (excluding zero). Do not include the authors or publication dates of the articles in the quotes, do not number each quote line, but present each quote on a new line:
{criter_prompts}

Give a brief literature review in the following format:
Provide the following JSON structure:
{comparison_table}
"""

        reduce_template = """Следующий текст состоит из нескольких кратких итогов:
{docs}

На основе этих кратких итогов, проведи анализ научных статей по введенным критериям, объединяя основные данные и выводя обобщающий литературный обзор.
Выведи результат в следующем формате:

1. Таблица, где:
- Первая колонка по вертикали — это названия статей (сокращенные без потери смысла).
- Последующие колонки — это критерии анализа.
- Строки содержат краткие данные по тексту каждой статьи, соответствующие критериям.

2. Под таблицей укажи прямые цитаты из текста, подтверждающие данные в таблице. Каждую цитату:
- Группируй по статьям.
- Пиши на языке оригинала текста.
- Не включай авторов и даты написания статьи.
- Если данных нет, укажи "Данных нет".

Обязательно предоставь полезный и четкий вывод.
Результат:

Приведи краткий обзор литературы в следующем формате:
{comparison_table}
"""

        map_prompt = PromptTemplate.from_template(map_template)
        map_chain = LLMChain(llm=chat_llm, prompt=map_prompt)

        reduce_prompt = PromptTemplate.from_template(reduce_template)
        reduce_chain = LLMChain(llm=chat_llm, prompt=reduce_prompt)

        combine_documents_chain = StuffDocumentsChain(
            llm_chain=reduce_chain, document_variable_name="docs"
        )

        reduce_documents_chain = ReduceDocumentsChain(
            combine_documents_chain=combine_documents_chain,
            collapse_documents_chain=combine_documents_chain,
            token_max=128000,
        )

        map_reduce_chain = MapReduceDocumentsChain(
            llm_chain=map_chain,
            reduce_documents_chain=reduce_documents_chain,
            document_variable_name="docs",
            return_intermediate_steps=False,
        )

        text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
            tokenizer,
            chunk_size=100000,
            chunk_overlap=14000,
        )

        split_docs = text_splitter.create_documents([text])

        image_descriptions = "\n".join(
            [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
        )

        result = map_reduce_chain.run({"input_documents": split_docs, "images": image_descriptions, "comparison_table": prom, 'criter_prompts': criter_prompts})
        return result

    def init(text_data, criter):
        """Pick the processing path by token count and parse the JSON answer."""
        if count_tokens_in_text(text_data) < 128000:
            rezult = process_scientific_articles_for_analysis_1(text_data, criter)
        else:
            rezult = process_scientific_articles_for_analysis_2(text_data, criter_prompts=criter)

        # The model usually wraps its JSON in a Markdown code fence, but not
        # always; strip the fence only when it is actually there.
        return json.loads(repair_json(_strip_code_fence(rezult)))

    return init(text, crit)
269
+
270
+
271
+ # Function to extract key topics
272
def extract_key_topics(content, images=[]):
    """Ask the model for the main themes of ``content`` as a bullet list.

    Args:
        content: The text to analyse.
        images: Image message parts appended after the text part. The default
            list is never mutated.

    Returns:
        The raw model answer.
    """
    instructions = f"""
Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
```{content}```
LIST IN ENGLISH:
-
"""
    # One text part followed by the image parts, all in a single user message.
    user_parts = [{"type": "text", "text": instructions}]
    user_parts.extend(images)
    completion = client_1.chat.complete(
        model="pixtral-12b-2409",
        messages=[{"role": "user", "content": user_parts}],
    )
    return completion.choices[0].message.content
285
+
286
def extract_key_topics_with_large_text(content, images=[]):
    """Extract the main themes of a text too long for a single request.

    The text is split into overlapping chunks, themes are extracted per chunk
    (map) and the per-chunk lists are condensed into one list (reduce).

    Args:
        content: The text to analyse.
        images: Image message parts; their ``image_url`` values are passed to
            the map prompt as plain text lines.

    Returns:
        The output of the map-reduce chain.
    """
    # Split the text into overlapping, token-sized documents.
    splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=100000,
        chunk_overlap=14000,
    )
    chunks = splitter.create_documents([content])

    # One text line per image for the map prompt.
    picture_lines = [
        f"Изображение {idx}: {img['image_url']}" for idx, img in enumerate(images, start=1)
    ]
    picture_block = "\n".join(picture_lines)

    # Map step: themes of a single chunk.
    per_chunk_template = """
Текст: {docs}
Изображения: {images}
Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
LIST IN ENGLISH:
-
:"""
    per_chunk_chain = LLMChain(
        llm=client_4, prompt=PromptTemplate.from_template(per_chunk_template)
    )

    # Reduce step: merge the per-chunk theme lists.
    merge_template = """Следующий текст состоит из нескольких кратких итогов:
{docs}
Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
LIST IN ENGLISH:
-
:"""
    merge_chain = LLMChain(
        llm=client_4, prompt=PromptTemplate.from_template(merge_template)
    )
    stuff_chain = StuffDocumentsChain(
        llm_chain=merge_chain, document_variable_name="docs"
    )
    reducer = ReduceDocumentsChain(
        combine_documents_chain=stuff_chain,
        collapse_documents_chain=stuff_chain,
        token_max=128000,
    )

    pipeline = MapReduceDocumentsChain(
        llm_chain=per_chunk_chain,
        reduce_documents_chain=reducer,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    return pipeline.run({"input_documents": chunks, "images": picture_block})
348
+
349
def search_relevant_articles_arxiv(key_topics, max_articles=10):
    """Query arXiv for every topic in parallel.

    Args:
        key_topics: Iterable of search queries.
        max_articles: Maximum number of results requested per topic.

    Returns:
        ``(articles_by_topic, topics)``: a dict mapping each topic that yielded
        results to a list of article dicts (``title``, ``doi``, ``summary``,
        ``url``, ``pdf_url``), and the de-duplicated list of topics for which
        at least one article was collected (in no particular order).
    """
    found_by_topic = {}
    topics_with_hits = []

    def fetch_articles_for_topic(topic):
        """Return ``(topic, articles)``; errors are printed, not raised."""
        collected = []
        try:
            query = arxiv.Search(
                query=topic,
                max_results=max_articles,
                sort_by=arxiv.SortCriterion.Relevance,
            )
            for paper in query.results():
                collected.append(
                    {
                        "title": paper.title,
                        "doi": paper.doi,
                        "summary": paper.summary,
                        "url": paper.entry_id,
                        "pdf_url": paper.pdf_url,
                    }
                )
                # NOTE(review): recorded once per article (duplicates are
                # removed on return); placement inferred — confirm.
                topics_with_hits.append(topic)
        except Exception as e:
            print(f"Error fetching articles for topic '{topic}': {e}")

        return topic, collected

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # One task per topic; results are consumed as they finish.
        pending = [executor.submit(fetch_articles_for_topic, item) for item in key_topics]
        for finished in concurrent.futures.as_completed(pending):
            topic, articles = finished.result()
            if articles:
                found_by_topic[topic] = articles

    return found_by_topic, list(set(topics_with_hits))
386
+
387
def init(content, images=None):
    """Find arXiv articles related to the main themes of ``content``.

    Args:
        content: The text whose themes are extracted.
        images: Optional image message parts passed on to theme extraction.

    Returns:
        ``(final_topics, result_json)`` where ``result_json`` is the
        topic-to-articles mapping serialised as indented JSON.
    """
    images = [] if images is None else images

    # Short texts fit in one request; long ones need the map-reduce variant.
    if count_tokens_in_text(text=content) < 128_000:
        raw_topics = extract_key_topics(content, images)
    else:
        raw_topics = extract_key_topics_with_large_text(content, images)

    # Strip the bullet markers first and only then drop blanks, so that lines
    # consisting of just "-" or spaces do not become empty search queries.
    stripped = (line.strip("- ") for line in raw_topics.split("\n"))
    key_topics = [topic for topic in stripped if topic]

    articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
    result_json = json.dumps(articles_by_topic, indent=4)
    return final_topics, result_json
400
+
401
def process_article_for_summary(text, images=[], compression_percentage=30):
    """Summarise an article in Russian with a single model request.

    Args:
        text: The article text.
        images: Image message parts; when eight or more are given only the
            first seven are sent.
        compression_percentage: The percentage quoted in the prompt.

    Returns:
        The model's summary text.
    """
    prompt = f"""
You are a commentator.
# article:
{text}
# Instructions:
## Summarize IN RUSSIAN:
In clear and concise language, summarize the key points and themes presented in the article by cutting it by {compression_percentage} percent.
"""

    # Cap the number of attached images.
    attached = images[:7] if len(images) >= 8 else images

    user_parts = [{"type": "text", "text": prompt}] + attached
    completion = client_3.chat.complete(
        model="pixtral-12b-2409",
        messages=[{"role": "user", "content": user_parts}],
    )
    return completion.choices[0].message.content
420
+
421
def process_large_article_for_summary(text, images=[], compression_percentage=30):
    """Summarise an article that is too long for a single request.

    The text is split into overlapping chunks, each chunk is summarised (map)
    and the partial summaries are condensed into a final one (reduce).

    Args:
        text: The article text.
        images: Image message parts; their ``image_url`` values are passed to
            the map prompt as plain text lines.
        compression_percentage: The percentage quoted in both prompts.

    Returns:
        The output of the map-reduce chain.
    """
    # Split the text into overlapping, token-sized documents.
    splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=100000,
        chunk_overlap=14000,
    )
    chunks = splitter.create_documents([text])

    # One text line per image for the map prompt.
    picture_lines = [
        f"Изображение {idx}: {img['image_url']}" for idx, img in enumerate(images, start=1)
    ]
    picture_block = "\n".join(picture_lines)

    # Map step: summary of a single chunk. The doubled braces survive the
    # f-string as single-brace placeholders for PromptTemplate.
    per_chunk_template = f"""Следующий текст состоит из текста и изображений:
Текст: {{docs}}
Изображения: {{images}}
На основе приведенного материала, выполните сжатие текста, выделяя основные темы и важные моменты.
Уровень сжатия: {compression_percentage}%.
Ответ предоставьте на русском языке в формате Markdown.
Полезный ответ:"""
    per_chunk_chain = LLMChain(
        llm=client_4, prompt=PromptTemplate.from_template(per_chunk_template)
    )

    # Reduce step: merge the partial summaries.
    merge_template = f"""Следующий текст состоит из нескольких кратких итогов:
{{docs}}
На основе этих кратких итогов, выполните финальное сжатие текста, объединяя основные темы и ключевые моменты.
Уровень сжатия: {compression_percentage}%.
Результат предоставьте на русском языке в формате Markdown.
Полезный ответ:"""
    merge_chain = LLMChain(
        llm=client_4, prompt=PromptTemplate.from_template(merge_template)
    )
    stuff_chain = StuffDocumentsChain(
        llm_chain=merge_chain, document_variable_name="docs"
    )
    reducer = ReduceDocumentsChain(
        combine_documents_chain=stuff_chain,
        collapse_documents_chain=stuff_chain,
        token_max=128000,
    )

    pipeline = MapReduceDocumentsChain(
        llm_chain=per_chunk_chain,
        reduce_documents_chain=reducer,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    return pipeline.run({"input_documents": chunks, "images": picture_block})
482
+
483
def ask_question_to_mistral(text, question, context, images=None):
    """Answer ``question`` about ``text`` in Russian with a single model request.

    Args:
        text: The source text the question refers to.
        question: The user's question.
        context: Additional web-search context appended to the prompt.
        images: Optional image message parts; at most seven are sent.

    Returns:
        The model's answer text.
    """
    prompt = f"Answer the following question without mentioning it or repeating the original text on which the question is asked in style markdown.IN RUSSIAN:\nQuestion: {question}\n\nText:\n{text}"
    # The web context belongs in the text part. Formatting the whole content
    # list into a string would send the image parts as Python repr text
    # instead of as images.
    prompt = f'{prompt}\n\nAdditional Context from Web Search:\n{context}'

    # Cap the number of attached images.
    images = list(images or [])[:7]

    message_content = [{"type": "text", "text": prompt}] + images
    response = client_2.chat.complete(
        model="pixtral-12b-2409",
        messages=[{"role": "user", "content": message_content}],
    )
    return response.choices[0].message.content
495
+
496
def ask_question_to_mistral_with_large_text(text, question, context, images=None):
    """Answer ``question`` about a text that is too long for a single request.

    The text is split into overlapping chunks, the question is answered per
    chunk (map) and the partial answers are merged into one (reduce).

    Args:
        text: The source text the question refers to.
        question: The user's question.
        context: Additional web-search context given to every map request.
        images: Optional image message parts; their ``image_url`` values are
            passed to the map prompt as plain text lines.

    Returns:
        The output of the map-reduce chain.
    """
    images = [] if images is None else images

    # Placeholders use single braces: PromptTemplate treats doubled braces as
    # literal braces, which would leave the question, context and images
    # unfilled in the prompt sent to the model.
    map_template = """Следующий текст содержит статью/произведение:
Текст: {docs}
Изображения: {images}
На основе приведенного текста, ответьте на следующий вопрос:
Вопрос: {question}
Ответ должен быть точным. Пожалуйста, ответьте на русском языке в формате Markdown.
Информация из интернета: {context}
Полезный ответ:"""

    reduce_template = """Следующий текст содержит несколько кратких ответов на вопрос:
{docs}
Объедините их в финальный ответ. Ответ предоставьте на русском языке в формате Markdown.
Полезный ответ:"""

    map_prompt = PromptTemplate.from_template(map_template)
    map_chain = LLMChain(llm=client_4, prompt=map_prompt)

    reduce_prompt = PromptTemplate.from_template(reduce_template)
    reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)

    # Combine documents chain for the reduce step.
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="docs"
    )

    reduce_documents_chain = ReduceDocumentsChain(
        combine_documents_chain=combine_documents_chain,
        collapse_documents_chain=combine_documents_chain,
        token_max=128000,
    )

    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    # Split the text into overlapping, token-sized documents.
    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=100000,
        chunk_overlap=14000,
    )
    split_docs = text_splitter.create_documents([text])

    # One text line per image for the map prompt.
    image_descriptions = "\n".join(
        [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
    )

    answer = map_reduce_chain.run({"input_documents": split_docs, "question": question, 'context': context, "images": image_descriptions})
    return answer
555
+
556
def gradio_interface(text_input, images_base64, task, question, crit, compression_percentage):
    """Dispatch one UI request to the handler of the selected task.

    Args:
        text_input: The text entered by the user.
        images_base64: Base64-encoded images as entered in the UI.
        task: One of "Summarization", "Question Answering", "Search Article"
            or "Lit Obzor".
        question: The question (used for "Question Answering").
        crit: The review criteria (used for "Lit Obzor").
        compression_percentage: The percentage (used for "Summarization").

    Returns:
        ``{"Summary": ...}`` or ``{"Answer": ...}`` for the first two tasks,
        the result of ``init`` for "Search Article", the result of ``lit_obr``
        for "Lit Obzor", and ``None`` for any other task value.
    """
    text, images = process_input(text_input, images_base64)

    if task == "Summarization":
        if count_tokens_in_text(text=text) < 128_000:
            summary = process_article_for_summary(text, images, compression_percentage)
        else:
            summary = process_large_article_for_summary(text, images, compression_percentage)
        return {"Summary": summary}

    if task == "Question Answering":
        if not question:
            return {"Answer": "No question provided."}

        search_results, tool = setup_search(question)

        # Flatten the search hits into "<source> : <text>" lines; the field
        # names differ between the two search backends.
        context = ''
        if search_results:
            if tool == 'tavily_tool':
                context = "".join(
                    f"{result.get('url', 'N/A')} : {result.get('content', 'No content')} \n"
                    for result in search_results
                )
            elif tool == 'jina_tool':
                context = "".join(
                    f"{result.get('link', 'N/A')} : {result.get('snippet', 'No snippet')} : {result.get('content', 'No content')} \n"
                    for result in search_results
                )

        if count_tokens_in_text(text + context) < 128_000:
            answer = ask_question_to_mistral(text, question, context, images)
        else:
            answer = ask_question_to_mistral_with_large_text(text, question, context, images)
        return {"Answer": answer}

    if task == 'Search Article':
        # Pass the decoded image parts, not the raw textbox string: the
        # downstream helpers expect a list of message-part dicts.
        return init(text, images)

    if task == 'Lit Obzor':
        return lit_obr(text, crit)

    return None
598
+
599
+
600
# Gradio UI: input widgets, task selector, task-specific inputs, JSON output.
# NOTE(review): the nesting of widgets inside the rows below was inferred; the
# original indentation is not visible — confirm the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("## Text Analysis: Summarization or Question Answering")

    with gr.Row():
        text_input = gr.Textbox(label="Input Text")
        images_base64 = gr.Textbox(label="Base64 Images (comma-separated, if any)", placeholder="data:image/jpeg;base64,...", lines=2)
        task_choice = gr.Radio(["Summarization", "Question Answering", "Search Article", "Lit Obzor"], label="Select Task")
        # The three task-specific inputs start hidden and are revealed by the
        # change handler below.
        question_input = gr.Textbox(label="Question (for Question Answering)", visible=False)
        lit_crit = gr.Textbox(label="Критерии для лит обзора", visible=False, placeholder="Введите критерии для литературного обзора.")
        compression_input = gr.Slider(label="Compression Percentage (for Summarization)", minimum=10, maximum=90, value=30, visible=False)

    # Hide or show components depending on the selected task.
    # The tuple order must match the ``outputs`` list.
    task_choice.change(lambda choice: (
        gr.update(visible=choice == "Question Answering"),  # For question input visibility
        gr.update(visible=choice == "Summarization"),  # For compression percentage visibility
        gr.update(visible=choice == "Lit Obzor")  # For literary review criteria visibility
    ), inputs=task_choice, outputs=[question_input, compression_input, lit_crit])

    with gr.Row():
        result_output = gr.JSON(label="Results")

    submit_button = gr.Button("Submit")
    # The order of ``inputs`` matches the parameter order of gradio_interface.
    submit_button.click(gradio_interface,
                        inputs=[text_input, images_base64, task_choice, question_input, lit_crit, compression_input],
                        outputs=result_output)

# Start the app; show_error=True is passed so errors are shown in the UI.
demo.launch(show_error=True)