Tonic committed
Commit 5fda074 · 1 parent: 54f1ed7

update demo for NV-Embed-v1

.github/workflows/publish.yml DELETED
@@ -1,29 +0,0 @@
-name: Python package
-on:
-  push:
-    tags:
-      - "v*.*.*"
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python 3.11
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.11
-      - name: Install python dependencies
-        run: |
-          pip install poetry
-          poetry install
-          poetry remove torch
-          poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
-      - name: Build package
-        run: |
-          poetry build
-      - name: Publish package
-        env:
-          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
-        run: |
-          poetry config pypi-token.pypi "$PYPI_TOKEN"
-          poetry publish

.github/workflows/tests.yml DELETED
@@ -1,34 +0,0 @@
-name: Integration test
-
-on: [push]
-
-env:
-  TORCH_DEVICE: "cpu"
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python 3.11
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.11
-      - name: Install python dependencies
-        run: |
-          pip install poetry
-          poetry install
-          poetry remove torch
-          poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
-      - name: Download benchmark data
-        run: |
-          wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1dbY0kBq2SUa885gmbLPUWSRzy5K7O5XJ"
-          unzip benchmark_data.zip
-          mv bench_data.json data/bench_data.json
-      - name: Run benchmark test
-        run: |
-          poetry run texify_benchmark --max 16
-          poetry run python scripts/verify_benchmark_scores.py data/bench_results.json
-
-
-

README.md CHANGED
@@ -1,9 +1,9 @@
 ---
 license: mit
-title: Tonic's e5
+title: Tonic's NV-Embed
 sdk: gradio
-emoji: 🐣🛌🏻🤗
-colorFrom: red
+emoji: n📽️n🛌🏻
+colorFrom: pink
 colorTo: green
 pinned: true
 app_file: app.py
app.py CHANGED
@@ -6,16 +6,18 @@ import threading
 import queue
 import gradio as gr
 import os
+import json
+import numpy as np
+
 
 title = """
-# 👋🏻Welcome to 🙋🏻‍♂️Tonic's 🐣e5-mistral🛌🏻Embeddings """
+# 👋🏻Welcome to 🙋🏻‍♂️Tonic's 📽️Nvidia 🛌🏻Embed V-1 !"""
+
 description = """
-You can use this ZeroGPU Space to test out the current model [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct). 🐣e5-mistral🛌🏻 has a larger context🪟window, a different prompting/return🛠️mechanism and generally better results than other embedding models. use it via API to create embeddings or try out the sentence similarity to see how various optimization parameters affect performance.
-You can also use 🐣e5-mistral🛌🏻 by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/e5?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3>
-Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟 [DataTonic](https://github.com/Tonic-AI/DataTonic) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
+You can use this Space to test out the current model [nvidia/NV-Embed-v1](https://huggingface.co/nvidia/NV-Embed-v1). 🐣a generalist embedding model that ranks No. 1 on the Massive Text Embedding Benchmark (MTEB benchmark)(as of May 24, 2024), with 56 tasks, encompassing retrieval, reranking, classification, clustering, and semantic textual similarity tasks.
+You can also use 📽️Nvidia 🛌🏻Embed V-1 by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/NV-Embed?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3>
+Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟 [MultiTonic](https://github.com/MultiTonic) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
-os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 tasks = {
     'ArguAna': 'Given a claim, find documents that refute the claim',
@@ -31,17 +33,45 @@ tasks = {
     'SCIDOCS': 'Given a scientific paper title, retrieve paper abstracts that are cited by the given paper',
     'SciFact': 'Given a scientific claim, retrieve documents that support or refute the claim',
     'Touche2020': 'Given a question, retrieve detailed and persuasive arguments that answer the question',
-    'TRECCOVID': 'Given a query on COVID-19, retrieve documents that answer the query',
+    'Natural Language Inference' : 'Retrieve semantically similar text',
+    'Natural Language Inference' : 'Given a premise, retrieve a hypothesis that is entailed by the premise 20k',
+    'PAQ, MSMARCO' : 'Given a web search query, retrieve relevant passages that answer the query',
+    'PAQ, MSMARCO' : 'Given a question, retrieve passages that answer the question',
+    'SQUAD' : 'Given a question, retrieve Wikipedia passages that answer the question',
+    'StackExchange' : 'Given a question paragraph at StackExchange, retrieve a question duplicated paragraph',
+    'Natural Question' : 'Given a question, retrieve Wikipedia passages that answer the question',
+    'BioASQ' : 'Given a question, retrieve detailed question descriptions that are duplicates to the given question',
+    'STS12, STS22, STSBenchmark' : 'Retrieve semantically similar text.',
+    'AmazonCounterfactualClassification' : 'Classify a given Amazon customer review text as either counterfactual or not-counterfactual',
+    'AmazonReviewsClassification' : 'Classify the given Amazon review into its appropriate rating category',
+    'Banking77Classification' : 'Given a online banking query, find the corresponding intents',
+    'EmotionClassification' : 'Classify the emotion expressed in the given Twitter message into one of the six emotions:anger, fear, joy, love, sadness, and surprise',
+    'ImdbClassification': 'Classify the sentiment expressed in the given movie review text from the IMDB dataset',
+    'MTOPIntentClassification' : 'Classify the intent of the given utterance in task-oriented conversation',
+    'ToxicConversationsClassification' : 'Classify the given comments as either toxic or not toxic',
+    'TweetSentimentExtractionClassification' : 'Classify the sentiment of a given tweet as either positive, negative, or neutral',
+    'ArxivClusteringP2P' : 'Identify the main and secondary category of Arxiv papers based on the titles and abstracts',
+    'ArxivClusteringS2S' : 'Identify the main and secondary category of Arxiv papers based on the titles',
+    'BiorxivClusteringP2P' : 'Identify the main category of Biorxiv papers based on the titles and abstracts',
+    'BiorxivClusteringS2S' : 'Identify the main category of Biorxiv papers based on the titles',
+    'MedrxivClusteringP2P' : 'Identify the main category of Medrxiv papers based on the titles and abstracts',
+    'MedrxivClusteringS2S' : 'Identify the main category of Medrxiv papers based on the titles',
+    'TwentyNewsgroupsClustering' : 'Identify the topic or theme of the given news articles'
 }
 
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Global queue for embedding requests
+# Define the model and tokenizer globally
+tokenizer = AutoTokenizer.from_pretrained('nvidia/NV-Embed-v1', trust_remote_code=True)
+model = AutoModel.from_pretrained('nvidia/NV-Embed-v1', trust_remote_code=True).to(device)
+
+# Embedding requests and response queues
 embedding_request_queue = queue.Queue()
 embedding_response_queue = queue.Queue()
 
-
-tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
-model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct', torch_dtype=torch.float16, device_map=device)
+def clear_cuda_cache():
+    torch.cuda.empty_cache()
 
 def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
     left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
@@ -52,18 +82,22 @@ def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
     batch_size = last_hidden_states.shape[0]
     return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
 
-def clear_cuda_cache():
-    torch.cuda.empty_cache()
-
-def free_memory(*args):
-    for arg in args:
-        del arg
-
-def load_corpus_from_json(file_path):
-    with open(file_path, 'r') as file:
-        data = json.load(file)
-    return data
-
+def format_response(embeddings):
+    return {
+        "data": [
+            {
+                "embedding": embeddings,
+                "index": 0,
+                "object": "embedding"
+            }
+        ],
+        "model": "e5-mistral",
+        "object": "list",
+        "usage": {
+            "prompt_tokens": 17,
+            "total_tokens": 17
+        }
+    }
 
 def embedding_worker():
     while True:
@@ -79,14 +113,13 @@ def embedding_worker():
         embedding_request_queue.task_done()
         clear_cuda_cache()
 
-threading.Thread(target=embedding_worker, daemon=True).start()
-
 def compute_embeddings(selected_task, input_text):
     try:
         task_description = tasks[selected_task]
     except KeyError:
         print(f"Selected task not found: {selected_task}")
        return f"Error: Task '{selected_task}' not found. Please select a valid task."
+
     max_length = 2048
     processed_texts = [f'Instruct: {task_description}\nQuery: {input_text}']
 
@@ -101,124 +134,42 @@ def compute_embeddings(selected_task, input_text):
     clear_cuda_cache()
     return embeddings_list
 
-def decode_embedding(embedding_str):
-    try:
-        embedding = [float(num) for num in embedding_str.split(',')]
-        embedding_tensor = torch.tensor(embedding, dtype=torch.float16, device=device)
-        decoded_embedding = tokenizer.decode(embedding_tensor[0], skip_special_tokens=True)
-        return decoded_embedding.cpu().numpy().tolist()
-    except Exception as e:
-        return f"Error in decoding: {str(e)}"
-
 def compute_similarity(selected_task, sentence1, sentence2, extra_sentence1, extra_sentence2):
     try:
         task_description = tasks[selected_task]
     except KeyError:
         print(f"Selected task not found: {selected_task}")
         return f"Error: Task '{selected_task}' not found. Please select a valid task."
+
     # Compute embeddings for each sentence
     embeddings1 = compute_embeddings(selected_task, sentence1)
     embeddings2 = compute_embeddings(selected_task, sentence2)
     embeddings3 = compute_embeddings(selected_task, extra_sentence1)
     embeddings4 = compute_embeddings(selected_task, extra_sentence2)
 
-    # Convert embeddings to tensors
-    embeddings_tensor1 = torch.tensor(embeddings1).to(device).half()
-    embeddings_tensor2 = torch.tensor(embeddings2).to(device).half()
-    embeddings_tensor3 = torch.tensor(embeddings3).to(device).half()
-    embeddings_tensor4 = torch.tensor(embeddings4).to(device).half()
-
-    # Compute cosine similarity
     similarity1 = compute_cosine_similarity(embeddings1, embeddings2)
     similarity2 = compute_cosine_similarity(embeddings1, embeddings3)
     similarity3 = compute_cosine_similarity(embeddings1, embeddings4)
 
-    # Free memory
-    free_memory(embeddings1, embeddings2, embeddings3, embeddings4)
-
     similarity_scores = {"Similarity 1-2": similarity1, "Similarity 1-3": similarity2, "Similarity 1-4": similarity3}
     clear_cuda_cache()
     return similarity_scores
-
+
 def compute_cosine_similarity(emb1, emb2):
     tensor1 = torch.tensor(emb1).to(device).half()
     tensor2 = torch.tensor(emb2).to(device).half()
     similarity = F.cosine_similarity(tensor1, tensor2).item()
-    free_memory(tensor1, tensor2)
     clear_cuda_cache()
     return similarity
 
-
-def compute_embeddings_batch(input_texts):
-    max_length = 2042
-    processed_texts = [f'Instruct: {task_description}\nQuery: {text}' for text in input_texts]
-
-    batch_dict = tokenizer(processed_texts, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)
-    batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
-    batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
-    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
-    outputs = model(**batch_dict)
-    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
-    embeddings = F.normalize(embeddings, p=2, dim=1)
-    clear_cuda_cache()
-    return embeddings.detach().cpu().numpy()
-
-def semantic_search(query_embedding, corpus_embeddings, top_k=5):
-    scores = np.dot(corpus_embeddings, query_embedding.T).flatten()
-    top_k_indices = np.argsort(scores)[::-1][:top_k]
-    return top_k_indices, scores[top_k_indices]
-
-def search_similar_sentences(input_question, corpus_sentences, corpus_embeddings):
-    question_embedding = compute_embeddings_batch([input_question])[0]
-    top_k_indices, top_k_scores = semantic_search(question_embedding, corpus_embeddings)
-    results = [(corpus_sentences[i], top_k_scores[i]) for i in top_k_indices]
-    return results
-
-# openai response object formatting
-def format_response(embeddings):
-    return {
-        "data": [
-            {
-                "embedding": embeddings,
-                "index": 0,
-                "object": "embedding"
-            }
-        ],
-        "model": "e5-mistral",
-        "object": "list",
-        "usage": {
-            "prompt_tokens": 17,
-            "total_tokens": 17
-        }
-    }
-
-def generate_and_format_embeddings(selected_task, input_text):
-    embedding_request_queue.put((selected_task, input_text))
-    response = embedding_response_queue.get()
-    embedding_response_queue.task_done()
-    clear_cuda_cache()
-    return response
-
-
 def app_interface():
-    corpus_sentences = []
-    corpus_embeddings = []
    with gr.Blocks() as demo:
        gr.Markdown(title)
        gr.Markdown(description)
+
        with gr.Row():
            task_dropdown = gr.Dropdown(list(tasks.keys()), label="Select a Task", value=list(tasks.keys())[0])

-        with gr.Tab("Embedding Generation"):
-            input_text_box = gr.Textbox(label="📖Input Text")
-            compute_button = gr.Button("Try🐣🛌🏻e5")
-            output_display = gr.Textbox(label="🐣e5-mistral🛌🏻 Embeddings")
-            compute_button.click(
-                fn=compute_embeddings,
-                inputs=[task_dropdown, input_text_box],
-                outputs=output_display
-            )
-
        with gr.Tab("Sentence Similarity"):
            sentence1_box = gr.Textbox(label="'Focus Sentence' - The 'Subject'")
            sentence2_box = gr.Textbox(label="'Input Sentence' - 1")
@@ -226,83 +177,17 @@ def app_interface():
            extra_sentence2_box = gr.Textbox(label="'Input Sentence' - 3")
            similarity_button = gr.Button("Compute Similarity")
            similarity_output = gr.Textbox(label="🐣e5-mistral🛌🏻 Similarity Scores")
+
            similarity_button.click(
                fn=compute_similarity,
                inputs=[task_dropdown, sentence1_box, sentence2_box, extra_sentence1_box, extra_sentence2_box],
                outputs=similarity_output
            )
-        with gr.Tab("Load Corpus"):
-            json_uploader = gr.File(label="Upload JSON File")
-            load_corpus_button = gr.Button("Load Corpus")
-            corpus_status = gr.Textbox(label="Corpus Status", value="Corpus not loaded")
-
-            def load_corpus(file_info):
-                if file_info is None:
-                    return "No file uploaded. Please upload a JSON file."
-                try:
-                    global corpus_sentences, corpus_embeddings
-                    corpus_sentences = load_corpus_from_json(file_info['name'])
-                    corpus_embeddings = compute_embeddings_batch(corpus_sentences)
-                    return "Corpus loaded successfully with {} sentences.".format(len(corpus_sentences))
-                except Exception as e:
-                    return "Error loading corpus: {}".format(e)
-
-            load_corpus_button.click(
-                fn=load_corpus,
-                inputs=json_uploader,
-                outputs=corpus_status
-            )
-
-        with gr.Tab("Semantic Search"):
-            input_question_box = gr.Textbox(label="Enter your question")
-            search_button = gr.Button("Search")
-            search_results_output = gr.Textbox(label="Search Results")
-
-            def perform_search(input_question):
-                if not corpus_sentences or not corpus_embeddings:
-                    return "Corpus is not loaded. Please load a corpus first."
-                return search_similar_sentences(input_question, corpus_sentences, corpus_embeddings)
-
-            search_button.click(
-                fn=perform_search,
-                inputs=input_question_box,
-                outputs=search_results_output
-            )
-
-        with gr.Tab("Connector-like Embeddings"):
-            with gr.Row():
-                input_text_box_connector = gr.Textbox(label="Input Text", placeholder="Enter text or array of texts")
-                model_dropdown_connector = gr.Dropdown(label="Model", choices=["ArguAna", "ClimateFEVER", "DBPedia", "FEVER", "FiQA2018", "HotpotQA", "MSMARCO", "NFCorpus", "NQ", "QuoraRetrieval", "SCIDOCS", "SciFact", "Touche2020", "TRECCOVID"], value="text-embedding-ada-002")
-                encoding_format_connector = gr.Radio(label="Encoding Format", choices=["float", "base64"], value="float")
-                user_connector = gr.Textbox(label="User", placeholder="Enter user identifier (optional)")
-                submit_button_connector = gr.Button("Generate Embeddings")
-                output_display_connector = gr.JSON(label="Embeddings Output")
-            submit_button_connector.click(
-                fn=generate_and_format_embeddings,
-                inputs=[model_dropdown_connector, input_text_box_connector],
-                outputs=output_display_connector
-            )
-
-        # with gr.Tab("Decode Embedding"):
-        #     embedding_input = gr.Textbox(label="Enter Embedding (comma-separated floats)")
-        #     decode_button = gr.Button("Decode")
-        #     decoded_output = gr.Textbox(label="Decoded Embedding")
-        #
-        #     decode_button.click(
-        #         fn=decode_embedding,
-        #         inputs=embedding_input,
-        #         outputs=decoded_output
-        #     )
-
-        with gr.Row():
-            with gr.Column():
-                input_text_box
-            with gr.Column():
-                compute_button
-                output_display
 
     return demo
 
+embedding_worker_thread = threading.Thread(target=embedding_worker, daemon=True)
+embedding_worker_thread.start()
 
 app_interface().queue()
 app_interface().launch(share=True)
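
For reference, the embedding path the updated app.py wires together can be exercised on its own. The sketch below is a minimal standalone version assuming the same model id, `trust_remote_code` loading, `Instruct:`/`Query:` prompt format, last-token pooling, and L2 normalization shown in the diff; the task and query strings are illustrative placeholders, and the assumption that the remote-code model's forward pass exposes `last_hidden_state` follows the app code above rather than being independently verified.

```python
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained('nvidia/NV-Embed-v1', trust_remote_code=True)
model = AutoModel.from_pretrained('nvidia/NV-Embed-v1', trust_remote_code=True).to(device)

def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    # Take the hidden state of each sequence's final non-padding token.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_states.shape[0]
    return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

# Placeholder task/query, following the demo's "Instruct: ...\nQuery: ..." format.
task = 'Given a claim, find documents that refute the claim'
text = f'Instruct: {task}\nQuery: The Earth is flat.'

batch = tokenizer([text], max_length=2048, padding=True, truncation=True, return_tensors='pt').to(device)
with torch.no_grad():
    outputs = model(**batch)
# Pool and L2-normalize, as in the app's compute path above.
embedding = F.normalize(last_token_pool(outputs.last_hidden_state, batch['attention_mask']), p=2, dim=1)
print(embedding.shape)
```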
benchmark.py DELETED
@@ -1,226 +0,0 @@
-import argparse
-import os.path
-import random
-import time
-from functools import partial
-
-import evaluate
-from tabulate import tabulate
-from tqdm import tqdm
-
-from texify.inference import batch_inference
-from texify.model.model import load_model
-from texify.model.processor import load_processor
-from PIL import Image
-from texify.settings import settings
-import json
-import base64
-import io
-from rapidfuzz.distance import Levenshtein
-
-
-def normalize_text(text):
-    # Replace fences
-    text = text.replace("$", "")
-    text = text.replace("\[", "")
-    text = text.replace("\]", "")
-    text = text.replace("\(", "")
-    text = text.replace("\)", "")
-    text = text.strip()
-    return text
-
-
-def score_text(predictions, references):
-    bleu = evaluate.load("bleu")
-    bleu_results = bleu.compute(predictions=predictions, references=references)
-
-    meteor = evaluate.load('meteor')
-    meteor_results = meteor.compute(predictions=predictions, references=references)
-
-    lev_dist = []
-    for p, r in zip(predictions, references):
-        lev_dist.append(Levenshtein.normalized_distance(p, r))
-
-    return {
-        'bleu': bleu_results["bleu"],
-        'meteor': meteor_results['meteor'],
-        'edit': sum(lev_dist) / len(lev_dist)
-    }
-
-
-def image_to_pil(image):
-    decoded = base64.b64decode(image)
-    return Image.open(io.BytesIO(decoded))
-
-
-def load_images(source_data):
-    images = [sd["image"] for sd in source_data]
-    images = [image_to_pil(image) for image in images]
-    return images
-
-
-def inference_texify(source_data, model, processor):
-    images = load_images(source_data)
-
-    write_data = []
-    for i in tqdm(range(0, len(images), settings.BATCH_SIZE), desc="Texify inference"):
-        batch = images[i:i+settings.BATCH_SIZE]
-        text = batch_inference(batch, model, processor)
-        for j, t in enumerate(text):
-            eq_idx = i + j
-            write_data.append({"text": t, "equation": source_data[eq_idx]["equation"]})
-
-    return write_data
-
-
-def inference_pix2tex(source_data):
-    from pix2tex.cli import LatexOCR
-    model = LatexOCR()
-
-    images = load_images(source_data)
-    write_data = []
-    for i in tqdm(range(len(images)), desc="Pix2tex inference"):
-        try:
-            text = model(images[i])
-        except ValueError:
-            # Happens when resize fails
-            text = ""
-        write_data.append({"text": text, "equation": source_data[i]["equation"]})
-
-    return write_data
-
-
-def image_to_bmp(image):
-    img_out = io.BytesIO()
-    image.save(img_out, format="BMP")
-    return img_out
-
-
-def inference_nougat(source_data, batch_size=1):
-    import torch
-    from nougat.postprocessing import markdown_compatible
-    from nougat.utils.checkpoint import get_checkpoint
-    from nougat.utils.dataset import ImageDataset
-    from nougat.utils.device import move_to_device
-    from nougat import NougatModel
-
-    # Load images, then convert to bmp format for nougat
-    images = load_images(source_data)
-    images = [image_to_bmp(image) for image in images]
-    predictions = []
-
-    ckpt = get_checkpoint(None, model_tag="0.1.0-small")
-    model = NougatModel.from_pretrained(ckpt)
-    if settings.TORCH_DEVICE_MODEL != "cpu":
-        move_to_device(model, bf16=settings.CUDA, cuda=settings.CUDA)
-    model.eval()
-
-    dataset = ImageDataset(
-        images,
-        partial(model.encoder.prepare_input, random_padding=False),
-    )
-
-    # Batch sizes higher than 1 explode memory usage on CPU/MPS
-    dataloader = torch.utils.data.DataLoader(
-        dataset,
-        batch_size=batch_size,
-        pin_memory=True,
-        shuffle=False,
-    )
-
-    for idx, sample in tqdm(enumerate(dataloader), desc="Nougat inference", total=len(dataloader)):
-        model.config.max_length = settings.MAX_TOKENS
-        model_output = model.inference(image_tensors=sample, early_stopping=False)
-        output = [markdown_compatible(o) for o in model_output["predictions"]]
-        predictions.extend(output)
-    return predictions
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Benchmark the performance of texify.")
-    parser.add_argument("--data_path", type=str, help="Path to JSON file with source images/equations", default=os.path.join(settings.DATA_DIR, "bench_data.json"))
-    parser.add_argument("--result_path", type=str, help="Path to JSON file to save results to.", default=os.path.join(settings.DATA_DIR, "bench_results.json"))
-    parser.add_argument("--max", type=int, help="Maximum number of images to benchmark.", default=None)
-    parser.add_argument("--pix2tex", action="store_true", help="Run pix2tex scoring", default=False)
-    parser.add_argument("--nougat", action="store_true", help="Run nougat scoring", default=False)
-    args = parser.parse_args()
-
-    source_path = os.path.abspath(args.data_path)
-    result_path = os.path.abspath(args.result_path)
-    os.makedirs(os.path.dirname(result_path), exist_ok=True)
-    model = load_model()
-    processor = load_processor()
-
-    with open(source_path, "r") as f:
-        source_data = json.load(f)
-
-    if args.max:
-        random.seed(1)
-        source_data = random.sample(source_data, args.max)
-
-    start = time.time()
-    predictions = inference_texify(source_data, model, processor)
-    times = {"texify": time.time() - start}
-    text = [normalize_text(p["text"]) for p in predictions]
-    references = [normalize_text(p["equation"]) for p in predictions]
-
-    scores = score_text(text, references)
-
-    write_data = {
-        "texify": {
-            "scores": scores,
-            "text": [{"prediction": p, "reference": r} for p, r in zip(text, references)]
-        }
-    }
-
-    if args.pix2tex:
-        start = time.time()
-        predictions = inference_pix2tex(source_data)
-        times["pix2tex"] = time.time() - start
-
-        p_text = [normalize_text(p["text"]) for p in predictions]
-
-        p_scores = score_text(p_text, references)
-
-        write_data["pix2tex"] = {
-            "scores": p_scores,
-            "text": [{"prediction": p, "reference": r} for p, r in zip(p_text, references)]
-        }
-
-    if args.nougat:
-        start = time.time()
-        predictions = inference_nougat(source_data)
-        times["nougat"] = time.time() - start
-        n_text = [normalize_text(p) for p in predictions]
-
-        n_scores = score_text(n_text, references)
-
-        write_data["nougat"] = {
-            "scores": n_scores,
-            "text": [{"prediction": p, "reference": r} for p, r in zip(n_text, references)]
-        }
-
-    score_table = []
-    score_headers = ["bleu", "meteor", "edit"]
-    score_dirs = ["⬆", "⬆", "⬇", "⬇"]
-
-    for method in write_data.keys():
-        score_table.append([method, *[write_data[method]["scores"][h] for h in score_headers], times[method]])
-
-    score_headers.append("time taken (s)")
-    score_headers = [f"{h} {d}" for h, d in zip(score_headers, score_dirs)]
-    print()
-    print(tabulate(score_table, headers=["Method", *score_headers]))
-    print()
-    print("Higher is better for BLEU and METEOR, lower is better for edit distance and time taken.")
-    print("Note that pix2tex is unbatched (I couldn't find a batch inference method in the docs), so time taken is higher than it should be.")
-
-    with open(result_path, "w") as f:
-        json.dump(write_data, f, indent=4)
-
-
-if __name__ == "__main__":
-    main()
-
-
-

data/.gitignore DELETED
@@ -1,6 +0,0 @@
-*
-!.gitignore
-!examples
-!examples/*
-!images
-!images/*

data/examples/0.md DELETED
@@ -1,5 +0,0 @@
-The potential $V_{i}$ of cell $\mathcal{C}_ {j}$ centred at position $\mathbf{r}_ {i}$ is related to the surface charge densities $\sigma_ {j}$ of cells $\mathcal{E}_ {j}$ $j\in[1,N]$ through the superposition principle as:
-
-$$V_ {i}\,=\,\sum_ {j=0}^{N}\,\frac{\sigma_ {j}}{4\pi\varepsilon_ {0}}\,\int_{\mathcal{E}_ {j}}\frac{1}{\left|\mathbf{r}_ {i}-\mathbf{r}^{\prime}\right|}\,\mathrm{d}^{2}\mathbf{r}^{\prime}\,=\,\sum_{j=0}^{N}\,Q_ {ij}\,\sigma_{j},$$
-
-where the integral over the surface of cell $\mathcal{C}_ {j}$ only depends on $\mathcal{C}{j}$ shape and on the relative position of the target point $\mathbf{r}_ {i}$ with respect to $\mathcal{C}_ {j}$ location, as $\sigma_ {j}$ is assumed constant over the whole surface of cell $\mathcal{C}_ {j}$.

data/examples/0.png DELETED
Binary file (24.1 kB)
 
data/examples/100.md DELETED
@@ -1 +0,0 @@
-Following , the minimal energy fraction the muon receives in the pion's rest frame is $r_ {\pi}=(m_ {\mu}/m_ {\pi})^2\approx0.57$, when it is emitted against the direction of movement, or 1 when it coincides with the pion's direction.

data/examples/100.png DELETED
Binary file (11.2 kB)
 
data/examples/300.md DELETED
@@ -1,4 +0,0 @@
-
-$$\mid\frac{1}{x}=\frac{1}{c}\mid=\mid\frac{c-x}{xc}\mid=\frac{1}{\left\vert x\right\vert}\cdot\frac{1}{\left\vert c\right\vert}\cdot\left\vert x-c\right\vert$$
-
-The factor $$\frac{1}{\left\vert x\right\vert}$$ is not good if its near 0.

data/examples/300.png DELETED
Binary file (5.48 kB)
 
data/examples/400.md DELETED
@@ -1,9 +0,0 @@
-Then the results are that afterward:
-
-For every value of $\lambda$, there is a probability of $|\langle\Psi|\Psi_\lambda\rangle|^2$ that the system is in state $|\Psi_\lambda\rangle$
-
-This is captured by the density matrix formalism as the transition
-
-$|\Psi\rangle\langle\Psi|\Rightarrow\sum_\lambda|\langle\Psi|\Psi_\lambda\rangle|^2|\Psi_\lambda\rangle\langle\Psi_\lambda|$
-
-atyy I guess thinking about it classically, Demystifier's argument must be right.

data/examples/400.png DELETED
Binary file (20.2 kB)
 
data/images/gui_screen.png DELETED
Binary file (655 kB)
 
data/images/texify_bench.png DELETED
Binary file (27.5 kB)
 
ocr_app.py DELETED
@@ -1,167 +0,0 @@
-import io
-
-import pandas as pd
-import streamlit as st
-from streamlit_drawable_canvas import st_canvas
-import hashlib
-import pypdfium2
-
-from texify.inference import batch_inference
-from texify.model.model import load_model
-from texify.model.processor import load_processor
-from texify.settings import settings
-import subprocess
-import re
-from PIL import Image
-
-MAX_WIDTH = 1000
-
-
-def replace_katex_invalid(string):
-    # KaTeX cannot render all LaTeX, so we need to replace some things
-    string = re.sub(r'\\tag\{.*?\}', '', string)
-    string = re.sub(r'\\Big\{(.*?)\}|\\big\{(.*?)\}', r'\1\2', string)
-    return string
-
-@st.cache_resource()
-def load_model_cached():
-    return load_model()
-
-
-@st.cache_resource()
-def load_processor_cached():
-    return load_processor()
-
-
-@st.cache_data()
-def infer_image(pil_image, bbox, temperature):
-    input_img = pil_image.crop(bbox)
-    model_output = batch_inference([input_img], model, processor, temperature=temperature)
-    return model_output[0]
-
-
-def open_pdf(pdf_file):
-    stream = io.BytesIO(pdf_file.getvalue())
-    return pypdfium2.PdfDocument(stream)
-
-
-@st.cache_data()
-def get_page_image(pdf_file, page_num, dpi=96):
-    doc = open_pdf(pdf_file)
-    renderer = doc.render(
-        pypdfium2.PdfBitmap.to_pil,
-        page_indices=[page_num - 1],
-        scale=dpi / 72,
-    )
-    png = list(renderer)[0]
-    png_image = png.convert("RGB")
-    return png_image
-
-
-@st.cache_data()
-def get_uploaded_image(in_file):
-    return Image.open(in_file).convert("RGB")
-
-
-@st.cache_data()
-def page_count(pdf_file):
-    doc = open_pdf(pdf_file)
-    return len(doc)
-
-
-def get_canvas_hash(pil_image):
-    return hashlib.md5(pil_image.tobytes()).hexdigest()
-
-
-@st.cache_data()
-def get_image_size(pil_image):
-    if pil_image is None:
-        return 800, 600
-    height, width = pil_image.height, pil_image.width
-    if width > MAX_WIDTH:
-        scale = MAX_WIDTH / width
-        height = int(height * scale)
-        width = MAX_WIDTH
-    return height, width
-
-
-st.set_page_config(layout="wide")
-
-top_message = """### Texify
-
-After the model loads, upload an image or a pdf, then draw a box around the equation or text you want to OCR by clicking and dragging. Texify will convert it to Markdown with LaTeX math on the right.
-
-If you have already cropped your image, select "OCR image" in the sidebar instead.
-"""
-
-st.markdown(top_message)
-col1, col2 = st.columns([.7, .3])
-
-model = load_model_cached()
-processor = load_processor_cached()
-
-in_file = st.sidebar.file_uploader("PDF file or image:", type=["pdf", "png", "jpg", "jpeg", "gif", "webp"])
-if in_file is None:
-    st.stop()
-
-filetype = in_file.type
-whole_image = False
-if "pdf" in filetype:
-    page_count = page_count(in_file)
-    page_number = st.sidebar.number_input(f"Page number out of {page_count}:", min_value=1, value=1, max_value=page_count)
-
-    pil_image = get_page_image(in_file, page_number)
-else:
-    pil_image = get_uploaded_image(in_file)
-    whole_image = st.sidebar.button("OCR image")
-
-temperature = st.sidebar.slider("Generation temperature:", min_value=0.0, max_value=1.0, value=0.0, step=0.05)
-
-canvas_hash = get_canvas_hash(pil_image) if pil_image else "canvas"
-
-with col1:
-    # Create a canvas component
-    canvas_result = st_canvas(
-        fill_color="rgba(255, 165, 0, 0.1)",  # Fixed fill color with some opacity
-        stroke_width=1,
-        stroke_color="#FFAA00",
-        background_color="#FFF",
-        background_image=pil_image,
-        update_streamlit=True,
-        height=get_image_size(pil_image)[0],
-        width=get_image_size(pil_image)[1],
-        drawing_mode="rect",
-        point_display_radius=0,
-        key=canvas_hash,
-    )
-
-if canvas_result.json_data is not None or whole_image:
-    objects = pd.json_normalize(canvas_result.json_data["objects"])  # need to convert obj to str because PyArrow
-    bbox_list = None
-    if objects.shape[0] > 0:
-        boxes = objects[objects["type"] == "rect"][["left", "top", "width", "height"]]
-        boxes["right"] = boxes["left"] + boxes["width"]
-        boxes["bottom"] = boxes["top"] + boxes["height"]
-        bbox_list = boxes[["left", "top", "right", "bottom"]].values.tolist()
-    if whole_image:
-        bbox_list = [(0, 0, pil_image.width, pil_image.height)]
-
-    if bbox_list:
-        with col2:
-            inferences = [infer_image(pil_image, bbox, temperature) for bbox in bbox_list]
-            for idx, inference in enumerate(reversed(inferences)):
-                st.markdown(f"### {len(inferences) - idx}")
-                katex_markdown = replace_katex_invalid(inference)
-                st.markdown(katex_markdown)
-                st.code(inference)
-                st.divider()
-
-with col2:
-    tips = """
-    ### Usage tips
-    - Don't make your boxes too small or too large. See the examples and the video in the [README](https://github.com/vikParuchuri/texify) for more info.
-    - Texify is sensitive to how you draw the box around the text you want to OCR. If you get bad results, try selecting a slightly different box, or splitting the box into multiple.
-    - You can try changing the temperature value on the left if you don't get good results. This controls how "creative" the model is.
-    - Sometimes KaTeX won't be able to render an equation (red error text), but it will still be valid LaTeX. You can copy the LaTeX and render it elsewhere.
-    """
-    st.markdown(tips)

ocr_image.py DELETED
@@ -1,67 +0,0 @@
-import argparse
-import os.path
-
-from texify.inference import batch_inference
-from texify.model.model import load_model
-from texify.model.processor import load_processor
-from PIL import Image
-from texify.settings import settings
-from texify.util import is_valid_image
-import json
-
-
-def inference_single_image(image_path, json_path, model, processor):
-    image = Image.open(image_path)
-    text = batch_inference([image], model, processor)
-    write_data = [{"image_path": image_path, "text": text[0]}]
-    with open(json_path, "w+") as f:
-        json_repr = json.dumps(write_data, indent=4)
-        f.write(json_repr)
-
-
-def inference_image_dir(image_dir, json_path, model, processor, max=None):
-    image_paths = [os.path.join(image_dir, image_name) for image_name in os.listdir(image_dir)]
-    image_paths = [ip for ip in image_paths if is_valid_image(ip)]
-    if max:
-        image_paths = image_paths[:max]
-
-    write_data = []
-    for i in range(0, len(image_paths), settings.BATCH_SIZE):
-        batch = image_paths[i:i+settings.BATCH_SIZE]
-        images = [Image.open(image_path) for image_path in batch]
-        text = batch_inference(images, model, processor)
-        for image_path, t in zip(batch, text):
-            write_data.append({"image_path": image_path, "text": t})
-
-    with open(json_path, "w+") as f:
-        json_repr = json.dumps(write_data, indent=4)
-        f.write(json_repr)
-
-
-def main():
-    parser = argparse.ArgumentParser(description="OCR an image of a LaTeX equation.")
-    parser.add_argument("image", type=str, help="Path to image or folder of images to OCR.")
-    parser.add_argument("--max", type=int, help="Maximum number of images to OCR if a folder is passes.", default=None)
-    parser.add_argument("--json_path", type=str, help="Path to JSON file to save results to.", default=os.path.join(settings.DATA_DIR, "results.json"))
-    args = parser.parse_args()
-
-    image_path = args.image
-    model = load_model()
-    processor = load_processor()
-
-    json_path = os.path.abspath(args.json_path)
-    os.makedirs(os.path.dirname(json_path), exist_ok=True)
-
-    if os.path.isfile(image_path):
-        inference_single_image(image_path, json_path, model, processor)
-    else:
-        inference_image_dir(image_path, json_path, model, processor, args.max)
-
-    print(f"Wrote results to {json_path}")
-
-
-if __name__ == "__main__":
-    main()
-
-
-

poetry.lock DELETED
The diff for this file is too large to render.
 
pyproject.toml DELETED
@@ -1,47 +0,0 @@
-[tool.poetry]
-name = "texify"
-version = "0.1.6"
-description = "OCR for latex images"
-authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
-readme = "README.md"
-license = "GPL-3.0-or-later"
-repository = "https://github.com/VikParuchuri/texify"
-keywords = ["ocr", "latex", "markdown", "pdf"]
-include = [
-    "ocr_app.py",
-    "ocr_image.py",
-    "run_ocr_app.py",
-    "benchmark.py"
-]
-
-[tool.poetry.dependencies]
-python = ">=3.10,<4.0"
-streamlit = "^1.29.0"
-transformers = "^4.36.2"
-torch = "^2.1.2"
-pydantic = "^2.5.2"
-pydantic-settings = "^2.1.0"
-Pillow = "^10.1.0"
-numpy = "^1.26.2"
-pypdfium2 = "^4.25.0"
-python-dotenv = "^1.0.0"
-watchdog = "^3.0.0"
-ftfy = "^6.1.3"
-tabulate = "^0.9.0"
-streamlit-drawable-canvas-jsretry = "^0.9.3"
-
-[tool.poetry.group.dev.dependencies]
-jupyter = "^1.0.0"
-evaluate = "^0.4.1"
-rapidfuzz = "^3.5.2"
-pyperclip = "^1.8.2"
-nltk = "^3.8.1"
-
-[tool.poetry.scripts]
-texify = "ocr_image:main"
-texify_gui = "run_ocr_app:run_app"
-texify_benchmark = "benchmark:main"
-
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"

requirements.txt CHANGED
@@ -1,3 +1,4 @@
 transformers
-torch
-accelerate
+torch==2.2.0
+accelerate
+flash-attn==2.2.0
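
The version pins matter here: flash-attn compiles CUDA extensions against the installed torch build, which is presumably why torch is pinned alongside it. A minimal sanity-check sketch (illustrative, not part of the commit) for confirming the pinned stack imports before launching the Space:

```python
# Illustrative check that the dependencies pinned in requirements.txt resolve.
import torch
import transformers

assert torch.__version__.startswith("2.2"), f"expected torch 2.2.x, got {torch.__version__}"

try:
    import flash_attn  # compiled CUDA extension; needs a matching torch build
    print("flash-attn", flash_attn.__version__)
except ImportError as e:
    print("flash-attn unavailable:", e)

print("torch", torch.__version__, "| transformers", transformers.__version__)
```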
run_ocr_app.py DELETED
@@ -1,8 +0,0 @@
-import subprocess
-import os
-
-
-def run_app():
-    cur_dir = os.path.dirname(os.path.abspath(__file__))
-    ocr_app_path = os.path.join(cur_dir, "ocr_app.py")
-    subprocess.run(["streamlit", "run", ocr_app_path])

scripts/verify_benchmark_scores.py DELETED
@@ -1,20 +0,0 @@
-import json
-import argparse
-
-
-def verify_scores(file_path):
-    with open(file_path, 'r') as file:
-        data = json.load(file)
-
-    scores = data["texify"]["scores"]
-
-    if scores["bleu"] <= 0.6 or scores["meteor"] <= 0.6 or scores["edit"] > 0.2:
-        print(scores)
-        raise ValueError("Scores do not meet the required threshold")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Verify benchmark scores")
-    parser.add_argument("file_path", type=str, help="Path to the json file")
-    args = parser.parse_args()
-    verify_scores(args.file_path)