Belemort committed on
Commit
f23ad63
1 Parent(s): e139162

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -250
app.py CHANGED
@@ -5,21 +5,12 @@ import concurrent.futures
5
  import json
6
  import os
7
  import arxiv
 
8
  from PIL import Image
9
  import io
10
  import base64
11
- from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
12
- from langchain.text_splitter import CharacterTextSplitter
13
- from langchain_mistralai import ChatMistralAI
14
- from langchain.chains.combine_documents.stuff import StuffDocumentsChain
15
- from langchain.chains.llm import LLMChain
16
- from langchain_core.prompts import PromptTemplate
17
- from transformers import AutoTokenizer
18
- tokenizer = AutoTokenizer.from_pretrained("mistral-community/pixtral-12b")
19
 
20
- def count_tokens_in_text(text):
21
- tokens = tokenizer(text, return_tensors="pt", truncation=False, add_special_tokens=True)
22
- return len(tokens["input_ids"][0])
23
 
24
 
25
  # Set environment variables for Tavily API
@@ -29,8 +20,6 @@ os.environ["TAVILY_API_KEY"] = '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'
29
  client_1 = Mistral(api_key='eLES5HrVqduOE1OSWG6C5XyEUeR7qpXQ')
30
  client_2 = Mistral(api_key='VPqG8sCy3JX5zFkpdiZ7bRSnTLKwngFJ')
31
  client_3 = Mistral(api_key='cvyu5Rdk2lS026epqL4VB6BMPUcUMSgt')
32
- api_key_4 = 'lCZWDjyQSEc5gJsATEcKjP9cCjWsB7lg'
33
- client_4 = ChatMistralAI(api_key=api_key_4, model="pixtral-12b-2409")
34
 
35
  # Function to encode images in base64
36
  def encode_image_bytes(image_bytes):
@@ -90,73 +79,6 @@ def extract_key_topics(content, images=[]):
90
  )
91
  return response.choices[0].message.content
92
 
93
- def extract_key_topics_with_large_text(content, images=[]):
94
- # Map prompt template for extracting key themes
95
- map_template = f"""
96
- Текст: {{docs}}
97
- Изображения: {{images}}
98
-
99
- Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
100
- LIST IN ENGLISH:
101
- -
102
-
103
- :"""
104
-
105
- map_prompt = PromptTemplate.from_template(map_template)
106
- map_chain = LLMChain(llm=client_4, prompt=map_prompt)
107
-
108
- # Reduce prompt template to further refine and extract key themes
109
- reduce_template = f"""Следующий текст состоит из нескольких кратких итогов:
110
- {{docs}}
111
-
112
- Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
113
- LIST IN ENGLISH:
114
- -
115
-
116
- :"""
117
-
118
- reduce_prompt = PromptTemplate.from_template(reduce_template)
119
- reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)
120
-
121
- # Combine documents chain for Reduce step
122
- combine_documents_chain = StuffDocumentsChain(
123
- llm_chain=reduce_chain, document_variable_name="docs"
124
- )
125
-
126
- # ReduceDocumentsChain configuration
127
- reduce_documents_chain = ReduceDocumentsChain(
128
- combine_documents_chain=combine_documents_chain,
129
- collapse_documents_chain=combine_documents_chain,
130
- token_max=128000,
131
- )
132
-
133
- # MapReduceDocumentsChain combining Map and Reduce
134
- map_reduce_chain = MapReduceDocumentsChain(
135
- llm_chain=map_chain,
136
- reduce_documents_chain=reduce_documents_chain,
137
- document_variable_name="docs",
138
- return_intermediate_steps=False,
139
- )
140
-
141
- # Text splitter configuration
142
- text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
143
- tokenizer,
144
- chunk_size=100000,
145
- chunk_overlap=14000,
146
- )
147
-
148
- # Split the text into documents
149
- split_docs = text_splitter.create_documents([content])
150
-
151
- # Include image descriptions (optional, if required by the prompt)
152
- image_descriptions = "\n".join(
153
- [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
154
- )
155
-
156
- # Run the summarization chain to extract key themes
157
- key_topics = map_reduce_chain.run({"input_documents": split_docs, "images": image_descriptions})
158
- return key_topics
159
-
160
  def search_relevant_articles_arxiv(key_topics, max_articles=100):
161
  articles_by_topic = {}
162
  final_topics = []
@@ -195,20 +117,13 @@ def search_relevant_articles_arxiv(key_topics, max_articles=100):
195
 
196
  return articles_by_topic, list(set(final_topics))
197
 
198
-
199
  def init(content, images=[]):
200
- if count_tokens_in_text(text=content) < 128_000:
201
- key_topics = extract_key_topics(content, images)
202
- key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
203
- articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
204
- result_json = json.dumps(articles_by_topic, indent=4)
205
- return final_topics, result_json
206
- else:
207
- key_topics = extract_key_topics_with_large_text(content, images)
208
- key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
209
- articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
210
- result_json = json.dumps(articles_by_topic, indent=4)
211
- return final_topics, result_json
212
 
213
  # Summarization function
214
  def process_article_for_summary(text, images=[], compression_percentage=30):
@@ -231,76 +146,6 @@ def process_article_for_summary(text, images=[], compression_percentage=30):
231
  )
232
  return response.choices[0].message.content
233
 
234
- def process_large_article_for_summary(text, images=[], compression_percentage=30):
235
- # Map prompt template
236
- map_template = f"""Следующий текст состоит из текста и изображений:
237
- Текст: {{docs}}
238
- Изображения: {{images}}
239
-
240
- На основе приведенного материала, выполните сжатие текста, выделяя основные темы и важные моменты.
241
- Уровень сжатия: {compression_percentage}%.
242
- Ответ предоставьте на русском языке в формате Markdown.
243
-
244
- Полезный ответ:"""
245
-
246
- map_prompt = PromptTemplate.from_template(map_template)
247
- map_chain = LLMChain(llm=client_4, prompt=map_prompt)
248
-
249
- # Reduce prompt template
250
- reduce_template = f"""Следующий текст состоит из нескольких кратких итогов:
251
- {{docs}}
252
-
253
- На основе этих кратких итогов, выполните финальное сжатие текста, объединяя основные темы и ключевые моменты.
254
- Уровень сжатия: {compression_percentage}%.
255
- Результат предоставьте на русском языке в формате Markdown.
256
-
257
- Полезный ответ:"""
258
-
259
- reduce_prompt = PromptTemplate.from_template(reduce_template)
260
- reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)
261
-
262
- # Combine documents chain for Reduce step
263
- combine_documents_chain = StuffDocumentsChain(
264
- llm_chain=reduce_chain, document_variable_name="docs"
265
- )
266
-
267
- # ReduceDocumentsChain configuration
268
- reduce_documents_chain = ReduceDocumentsChain(
269
- combine_documents_chain=combine_documents_chain,
270
- collapse_documents_chain=combine_documents_chain,
271
- token_max=128000,
272
- )
273
-
274
- # MapReduceDocumentsChain combining Map and Reduce
275
- map_reduce_chain = MapReduceDocumentsChain(
276
- llm_chain=map_chain,
277
- reduce_documents_chain=reduce_documents_chain,
278
- document_variable_name="docs",
279
- return_intermediate_steps=False,
280
- )
281
-
282
- # Text splitter configuration
283
- text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
284
- tokenizer,
285
- chunk_size=100000,
286
- chunk_overlap=14000,
287
- )
288
-
289
- # Split the text into documents
290
- split_docs = text_splitter.create_documents([text])
291
- # Include image descriptions
292
- image_descriptions = "\n".join(
293
- [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
294
- )
295
-
296
- # Run the summarization chain
297
-
298
- with concurrent.futures.ThreadPoolExecutor() as executor:
299
- extract_future = executor.submit(init, text, images)
300
- summary = map_reduce_chain.run({"input_documents": split_docs, "images": image_descriptions})
301
- key_topics , result_article_json = extract_future.result()
302
- return summary, key_topics, result_article_json
303
-
304
  # Question answering function
305
  def ask_question_to_mistral(text, question, images=[]):
306
  prompt = f"Answer the following question without mentioning it or repeating the original text on which the question is asked in style markdown.IN RUSSIAN:\nQuestion: {question}\n\nText:\n{text}"
@@ -324,100 +169,19 @@ def ask_question_to_mistral(text, question, images=[]):
324
  )
325
  return response.choices[0].message.content
326
 
327
- def ask_question_to_mistral_with_large_text(text, question, images=[]):
328
- # Prompts for QA
329
- map_template = """Следующий текст содержит статью/произведение:
330
- Текст: {{docs}}
331
- Изображения: {{images}}
332
- На основе приведенного текста, ответьте на следующий вопрос:
333
-
334
- Вопрос: {question}
335
-
336
- Ответ должен быть точным. Пожалуйста, ответьте на русском языке в формате Markdown.
337
-
338
- Полезный ответ:"""
339
-
340
- reduce_template = """Следующий текст содержит несколько кратких ответов на вопрос:
341
- {{docs}}
342
-
343
- Объедините их в финальный ответ. Ответ предоставьте на русском языке в формате Markdown.
344
-
345
- Полезный ответ:"""
346
-
347
- map_prompt = PromptTemplate.from_template(map_template)
348
- map_chain = LLMChain(llm=client_4, prompt=map_prompt)
349
-
350
- reduce_prompt = PromptTemplate.from_template(reduce_template)
351
- reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)
352
-
353
- # Combine documents chain for Reduce step
354
- combine_documents_chain = StuffDocumentsChain(
355
- llm_chain=reduce_chain, document_variable_name="docs"
356
- )
357
-
358
- # ReduceDocumentsChain configuration
359
- reduce_documents_chain = ReduceDocumentsChain(
360
- combine_documents_chain=combine_documents_chain,
361
- collapse_documents_chain=combine_documents_chain,
362
- token_max=128000,
363
- )
364
-
365
- # MapReduceDocumentsChain combining Map and Reduce
366
- map_reduce_chain = MapReduceDocumentsChain(
367
- llm_chain=map_chain,
368
- reduce_documents_chain=reduce_documents_chain,
369
- document_variable_name="docs",
370
- return_intermediate_steps=False,
371
- )
372
-
373
- # Text splitter configuration
374
- text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
375
- tokenizer,
376
- chunk_size=100000,
377
- chunk_overlap=14000,
378
- )
379
-
380
- # Split the text into documents
381
- split_docs = text_splitter.create_documents([text])
382
-
383
- # Include image descriptions
384
- image_descriptions = "\n".join(
385
- [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
386
- )
387
-
388
- with concurrent.futures.ThreadPoolExecutor() as executor:
389
- extract_future = executor.submit(init, text, images)
390
- summary = map_reduce_chain.run({"input_documents": split_docs, "question": question , "images": image_descriptions})
391
- key_topics , result_article_json = extract_future.result()
392
- return summary, key_topics, result_article_json
393
-
394
-
395
  # Gradio interface
396
  def gradio_interface(text_input, images_base64, task, question, compression_percentage):
397
  text, images = process_input(text_input, images_base64)
398
 
399
- if task == "Summarization":
400
 
401
- if count_tokens_in_text(text=text) < 128_000:
402
- topics, articles_json = init(text, images)
403
- summary = process_article_for_summary(text, images, compression_percentage)
404
- return {"Topics": topics, "Summary": summary, "Articles": articles_json}
405
-
406
- else:
407
- summary , key_topics, result_article_json = process_large_article_for_summary(text, images, compression_percentage)
408
- return {"Topics": key_topics, "Summary": summary, "Articles": result_article_json}
409
-
410
  elif task == "Question Answering":
411
-
412
  if question:
413
-
414
- if count_tokens_in_text(text=text) < 128_000:
415
- topics, articles_json = init(text, images)
416
- answer = ask_question_to_mistral(text, question, images)
417
- return {"Topics": topics, "Answer": answer, "Articles": articles_json}
418
- else:
419
- summary , key_topics, result_article_json = ask_question_to_mistral_with_large_text(text, question, images)
420
- return {"Topics": key_topics, "Answer": answer, "Articles": result_article_json}
421
  else:
422
  return {"Topics": topics, "Answer": "No question provided.", "Articles": articles_json}
423
 
 
5
  import json
6
  import os
7
  import arxiv
8
+ from docx import Document
9
  from PIL import Image
10
  import io
11
  import base64
 
 
 
 
 
 
 
 
12
 
13
+
 
 
14
 
15
 
16
  # Set environment variables for Tavily API
 
20
  client_1 = Mistral(api_key='eLES5HrVqduOE1OSWG6C5XyEUeR7qpXQ')
21
  client_2 = Mistral(api_key='VPqG8sCy3JX5zFkpdiZ7bRSnTLKwngFJ')
22
  client_3 = Mistral(api_key='cvyu5Rdk2lS026epqL4VB6BMPUcUMSgt')
 
 
23
 
24
  # Function to encode images in base64
25
  def encode_image_bytes(image_bytes):
 
79
  )
80
  return response.choices[0].message.content
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def search_relevant_articles_arxiv(key_topics, max_articles=100):
83
  articles_by_topic = {}
84
  final_topics = []
 
117
 
118
  return articles_by_topic, list(set(final_topics))
119
 
120
# Initialize process for text analysis
def init(content, images=None):
    """Extract key topics from *content* and find related arXiv articles.

    Parameters
    ----------
    content : str
        The article text to analyse.
    images : list, optional
        Image payloads forwarded to the topic extractor (presumably
        base64-encoded image-URL dicts — confirm against caller).

    Returns
    -------
    tuple
        ``(final_topics, result_json)`` — the de-duplicated topic list and
        a JSON string mapping each topic to the arXiv articles found for it.
    """
    # Guard against the shared mutable default-argument pitfall.
    images = [] if images is None else images
    key_topics = extract_key_topics(content, images)
    # The model returns a newline-separated bullet list; strip "- " markers
    # and drop empty lines.
    key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
    articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
    result_json = json.dumps(articles_by_topic, indent=4)
    return final_topics, result_json
 
 
 
 
 
 
 
127
 
128
  # Summarization function
129
  def process_article_for_summary(text, images=[], compression_percentage=30):
 
146
  )
147
  return response.choices[0].message.content
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  # Question answering function
150
  def ask_question_to_mistral(text, question, images=[]):
151
  prompt = f"Answer the following question without mentioning it or repeating the original text on which the question is asked in style markdown.IN RUSSIAN:\nQuestion: {question}\n\nText:\n{text}"
 
169
  )
170
  return response.choices[0].message.content
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
# Gradio interface
def gradio_interface(text_input, images_base64, task, question, compression_percentage):
    """Dispatch a UI request to summarization or question answering.

    Parameters
    ----------
    text_input : str
        Raw article text (or document payload) from the UI.
    images_base64 :
        Accompanying images; decoded by ``process_input``.
    task : str
        Either ``"Summarization"`` or ``"Question Answering"``.
    question : str
        The user's question; only consulted for question answering.
    compression_percentage :
        Target compression level passed to the summarizer.

    Returns
    -------
    dict
        Always contains ``"Topics"`` and ``"Articles"``, plus either
        ``"Summary"`` or ``"Answer"`` depending on the task.
    """
    text, images = process_input(text_input, images_base64)

    # Topic extraction / arXiv search is needed by every branch below,
    # so run it once up front.
    topics, articles_json = init(text, images)

    if task == "Summarization":
        summary = process_article_for_summary(text, images, compression_percentage)
        return {"Topics": topics, "Summary": summary, "Articles": articles_json}
    elif task == "Question Answering":
        if question:
            answer = ask_question_to_mistral(text, question, images)
            return {"Topics": topics, "Answer": answer, "Articles": articles_json}
        else:
            return {"Topics": topics, "Answer": "No question provided.", "Articles": articles_json}

    # Defensive fallback: the UI offers only the two tasks above, but an
    # unrecognized value must not silently return None.
    return {"Topics": topics, "Answer": f"Unknown task: {task}", "Articles": articles_json}
187