Spaces:

ArneBinder
/

sam-pointer-bart-base-v0.3

Running

App Files Files Community

ArneBinder committed 12 days ago

Commit

1681237

•

1 Parent(s): 148e0d6

Upload 10 files

Browse files

from https://github.com/ArneBinder/pie-document-level/pull/221

Files changed (6) hide show

app.py +59 -13
document_store.py +0 -2
embedding.py +124 -0
model_utils.py +68 -116
rendering_utils.py +4 -1
requirements.txt +3 -0

app.py CHANGED Viewed

@@ -7,8 +7,10 @@ from typing import List, Optional, Tuple
 import gradio as gr
 import pandas as pd
 from document_store import DocumentStore, get_annotation_from_document
-from model_utils import create_and_annotate_document, load_models
 from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
 from pytorch_ie import Pipeline
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
@@ -26,6 +28,9 @@ DEFAULT_MODEL_REVISION = "76300f8e534e2fcf695f00cb49bba166739b8d8a"
 # DEFAULT_MODEL_NAME = "models/dataset-sciarg/task-ner_re/v0.3/2024-05-28_23-33-46"
 # DEFAULT_MODEL_REVISION = None
 DEFAULT_EMBEDDING_MODEL_NAME = "allenai/scibert_scivocab_uncased"
 def render_annotated_document(
@@ -47,22 +52,26 @@ def render_annotated_document(
 def wrapped_process_text(
     text: str,
     doc_id: str,
-    models: Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]],
     document_store: DocumentStore,
 ) -> Tuple[dict, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
-    document = create_and_annotate_document(
-        text=text,
-        doc_id=doc_id,
-        models=models,
-    )
-    document_store.add_document(document)
     # Return as dict and document to avoid serialization issues
     return document.asdict(), document
 def process_uploaded_files(
     file_names: List[str],
-    models: Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]],
     document_store: DocumentStore,
 ) -> pd.DataFrame:
     try:
@@ -74,7 +83,16 @@ def process_uploaded_files(
                     text = f.read()
                 base_file_name = os.path.basename(file_name)
                 gr.Info(f"Processing file '{base_file_name}' ...")
-                new_documents.append(create_and_annotate_document(text, base_file_name, models))
             else:
                 raise gr.Error(f"Unsupported file format: {file_name}")
         document_store.add_documents(new_documents)
@@ -143,10 +161,13 @@ def main():
     example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
     print("Loading models ...")
-    argumentation_model, embedding_model, embedding_tokenizer = load_models(
         model_name=DEFAULT_MODEL_NAME,
         revision=DEFAULT_MODEL_REVISION,
         embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
     )
     default_render_kwargs = {
@@ -179,7 +200,7 @@ def main():
             DocumentStore(span_annotation_caption="adu", relation_annotation_caption="relation")
         )
         # wrap the pipeline and the embedding model/tokenizer in a tuple to avoid that it gets called
-        models_state = gr.State((argumentation_model, embedding_model, embedding_tokenizer))
         with gr.Row():
             with gr.Column(scale=1):
                 doc_id = gr.Textbox(
@@ -204,10 +225,35 @@ def main():
                         label=f"Embedding Model Name (e.g. {DEFAULT_EMBEDDING_MODEL_NAME})",
                         value=DEFAULT_EMBEDDING_MODEL_NAME,
                     )
                     load_models_btn = gr.Button("Load Models")
                     load_models_btn.click(
                         fn=load_models,
-                        inputs=[model_name, model_revision, embedding_model_name],
                         outputs=models_state,
                     )

 import gradio as gr
 import pandas as pd
+import torch
 from document_store import DocumentStore, get_annotation_from_document
+from embedding import EmbeddingModel
+from model_utils import annotate_document, create_document, load_models
 from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
 from pytorch_ie import Pipeline
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 # DEFAULT_MODEL_NAME = "models/dataset-sciarg/task-ner_re/v0.3/2024-05-28_23-33-46"
 # DEFAULT_MODEL_REVISION = None
 DEFAULT_EMBEDDING_MODEL_NAME = "allenai/scibert_scivocab_uncased"
+DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DEFAULT_EMBEDDING_MAX_LENGTH = 512
+DEFAULT_EMBEDDING_BATCH_SIZE = 32
 def render_annotated_document(
 def wrapped_process_text(
     text: str,
     doc_id: str,
+    models: Tuple[Pipeline, Optional[EmbeddingModel]],
     document_store: DocumentStore,
 ) -> Tuple[dict, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
+    try:
+        document = create_document(text=text, doc_id=doc_id)
+        annotate_document(
+            document=document,
+            annotation_pipeline=models[0],
+            embedding_model=models[1],
+        )
+        document_store.add_document(document)
+    except Exception as e:
+        raise gr.Error(f"Failed to process text: {e}")
     # Return as dict and document to avoid serialization issues
     return document.asdict(), document
 def process_uploaded_files(
     file_names: List[str],
+    models: Tuple[Pipeline, Optional[EmbeddingModel]],
     document_store: DocumentStore,
 ) -> pd.DataFrame:
     try:
                     text = f.read()
                 base_file_name = os.path.basename(file_name)
                 gr.Info(f"Processing file '{base_file_name}' ...")
+                new_document = create_document(
+                    text=text,
+                    doc_id=base_file_name,
+                )
+                annotate_document(
+                    document=new_document,
+                    annotation_pipeline=models[0],
+                    embedding_model=models[1],
+                )
+                new_documents.append(new_document)
             else:
                 raise gr.Error(f"Unsupported file format: {file_name}")
         document_store.add_documents(new_documents)
     example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
     print("Loading models ...")
+    argumentation_model, embedding_model = load_models(
         model_name=DEFAULT_MODEL_NAME,
         revision=DEFAULT_MODEL_REVISION,
         embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
+        embedding_max_length=DEFAULT_EMBEDDING_MAX_LENGTH,
+        embedding_batch_size=DEFAULT_EMBEDDING_BATCH_SIZE,
+        device=DEFAULT_DEVICE,
     )
     default_render_kwargs = {
             DocumentStore(span_annotation_caption="adu", relation_annotation_caption="relation")
         )
         # wrap the pipeline and the embedding model/tokenizer in a tuple to avoid that it gets called
+        models_state = gr.State((argumentation_model, embedding_model))
         with gr.Row():
             with gr.Column(scale=1):
                 doc_id = gr.Textbox(
                         label=f"Embedding Model Name (e.g. {DEFAULT_EMBEDDING_MODEL_NAME})",
                         value=DEFAULT_EMBEDDING_MODEL_NAME,
                     )
+                    embedding_max_length = gr.Slider(
+                        label="Embedding Model Max Length",
+                        minimum=16,
+                        maximum=2048,
+                        step=8,
+                        value=DEFAULT_EMBEDDING_MAX_LENGTH,
+                    )
+                    embedding_batch_size = gr.Slider(
+                        label="Embedding Model Batch Size",
+                        minimum=1,
+                        maximum=128,
+                        step=1,
+                        value=DEFAULT_EMBEDDING_BATCH_SIZE,
+                    )
+                    device = gr.Textbox(
+                        label="Device (e.g. 'cuda' or 'cpu')",
+                        value=DEFAULT_DEVICE,
+                    )
                     load_models_btn = gr.Button("Load Models")
                     load_models_btn.click(
                         fn=load_models,
+                        inputs=[
+                            model_name,
+                            model_revision,
+                            embedding_model_name,
+                            embedding_max_length,
+                            embedding_batch_size,
+                            device,
+                        ],
                         outputs=models_state,
                     )

document_store.py CHANGED Viewed

@@ -307,8 +307,6 @@ class DocumentStore:
     def add_document_from_dict(self, document_dict: dict) -> None:
         document = self.document_type.fromdict(document_dict)
-        # metadata is not automatically deserialized, so we need to set it manually
-        document.metadata = document_dict["metadata"]
         self.add_document(document)
     def add_documents(self, documents: List[TextBasedDocument]) -> None:

     def add_document_from_dict(self, document_dict: dict) -> None:
         document = self.document_type.fromdict(document_dict)
         self.add_document(document)
     def add_documents(self, documents: List[TextBasedDocument]) -> None:

embedding.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import abc
+import logging
+from typing import Dict
+import torch
+from datasets import Dataset
+from pie_modules.document.processing import tokenize_document
+from pie_modules.documents import TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+from pytorch_ie.annotations import Span
+from pytorch_ie.documents import TextBasedDocument
+from torch import FloatTensor, Tensor
+from torch.utils.data import DataLoader
+from transformers import AutoModel, AutoTokenizer
+logger = logging.getLogger(__name__)
+class EmbeddingModel(abc.ABC):  # interface for callables that embed the span annotations of a document
+    def __call__(
+        self, document: TextBasedDocument, span_layer_name: str
+    ) -> Dict[Span, FloatTensor]:
+        """Embed text annotations from a document.
+
+        This base implementation does nothing and returns None. It is not marked with
+        abc.abstractmethod, so subclasses are expected (but not forced) to override it.
+
+        Args:
+            document: The document to embed.
+            span_layer_name: The name of the annotation layer in the document that contains the
+                text span annotations to embed.
+        Returns:
+            A dictionary mapping text annotations to their embeddings.
+        """
+        pass
+class HuggingfaceEmbeddingModel(EmbeddingModel):  # EmbeddingModel backed by a transformers AutoModel / AutoTokenizer pair
+    def __init__(
+        self,
+        model_name_or_path: str,
+        revision: str = None,
+        device: str = "cpu",
+        max_length: int = 512,
+        batch_size: int = 16,
+    ):
+        self.load(model_name_or_path, revision, device)
+        self.max_length = max_length  # tokenizer window size; the stride is derived from it in __call__
+        self.batch_size = batch_size  # number of tokenized windows per forward pass
+    def load(self, model_name_or_path: str, revision: str = None, device: str = "cpu") -> None:
+        self._model = AutoModel.from_pretrained(model_name_or_path, revision=revision).to(device)  # only the model is moved to `device`
+        self._tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, revision=revision)
+    def __call__(
+        self, document: TextBasedDocument, span_layer_name: str
+    ) -> Dict[Span, FloatTensor]:
+        # work on a copy so that the document passed in by the caller is not modified
+        document = document.copy()
+        # tokenize_document does not yet consider predictions, so move them into the regular annotations (relies on clear() returning the removed entries)
+        document[span_layer_name].extend(document[span_layer_name].predictions.clear())
+        added_annotations = []  # filled by tokenize_document; indexed per tokenized document below
+        tokenizer_kwargs = {
+            "max_length": self.max_length,
+            "stride": self.max_length // 8,  # overlap between consecutive windows (see the tokenizer docs)
+            "truncation": True,
+            "padding": True,
+            "return_overflowing_tokens": True,
+        }
+        # tokenize once to get the tokenized documents with mapped annotations
+        tokenized_documents = tokenize_document(
+            document,
+            tokenizer=self._tokenizer,
+            result_document_type=TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+            partition_layer="labeled_partitions",
+            added_annotations=added_annotations,
+            strict_span_conversion=False,
+            **tokenizer_kwargs,
+        )
+        # tokenize again, with the same kwargs, to get tensors in the correct format for the model
+        dataset = Dataset.from_dict({"text": [document.text]})
+        def tokenize_function(examples):
+            return self._tokenizer(examples["text"], **tokenizer_kwargs)
+        # Tokenize the texts. Note that we remove the text column directly in the map call,
+        # otherwise the map would fail because we may produce multiple new rows
+        # (tokenization result) for each input row (text).
+        tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+        # remove the overflow_to_sample_mapping column (produced because of return_overflowing_tokens)
+        tokenized_dataset = tokenized_dataset.remove_columns(["overflow_to_sample_mapping"])
+        tokenized_dataset.set_format(type="torch")
+        dataloader = DataLoader(tokenized_dataset, batch_size=self.batch_size)
+        embeddings = {}
+        example_idx = 0  # running index into tokenized_documents / added_annotations across all batches
+        for batch in dataloader:
+            batch_at_device = {
+                k: v.to(self._model.device) if isinstance(v, Tensor) else v
+                for k, v in batch.items()
+            }
+            with torch.no_grad():
+                model_output = self._model(**batch_at_device)
+            for last_hidden_state in model_output.last_hidden_state:
+                text2tok_ann = added_annotations[example_idx][span_layer_name]
+                tok2text_ann = {v: k for k, v in text2tok_ann.items()}  # invert: token-level span -> text-level span
+                for tok_ann in tokenized_documents[example_idx].labeled_spans:
+                    # skip "empty" annotations
+                    if tok_ann.start == tok_ann.end:
+                        continue
+                    # use the max pooling strategy to get a single embedding for the annotation text
+                    embedding = (
+                        last_hidden_state[tok_ann.start : tok_ann.end].max(dim=0)[0].detach().cpu()
+                    )
+                    text_ann = tok2text_ann[tok_ann]
+                    if text_ann in embeddings:
+                        logger.warning(
+                            f"Overwriting embedding for annotation '{text_ann}' (do you use striding?)"
+                        )
+                    embeddings[text_ann] = embedding  # a later window overwrites an earlier one for the same span
+                example_idx += 1
+        return embeddings

model_utils.py CHANGED Viewed

@@ -1,98 +1,45 @@
 import logging
-from typing import Dict, List, Optional, Tuple
 import gradio as gr
 from annotation_utils import labeled_span_to_id
-from pie_modules.document.processing import tokenize_document
-from pie_modules.documents import TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from pytorch_ie import Pipeline
 from pytorch_ie.annotations import LabeledSpan
 from pytorch_ie.auto import AutoPipeline
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer
 logger = logging.getLogger(__name__)
-def _embed_text_annotations(
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
-    model: PreTrainedModel,
-    tokenizer: PreTrainedTokenizer,
-    text_layer_name: str,
-) -> Dict[LabeledSpan, List[float]]:
-    # to not modify the original document
-    document = document.copy()
-    # tokenize_document does not yet consider predictions, so we need to add them manually
-    document[text_layer_name].extend(document[text_layer_name].predictions.clear())
-    added_annotations = []
-    tokenizer_kwargs = {
-        "max_length": 512,
-        "stride": 64,
-        "truncation": True,
-        "return_overflowing_tokens": True,
-    }
-    tokenized_documents = tokenize_document(
-        document,
-        tokenizer=tokenizer,
-        result_document_type=TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
-        partition_layer="labeled_partitions",
-        added_annotations=added_annotations,
-        strict_span_conversion=False,
-        **tokenizer_kwargs,
-    )
-    # just tokenize again to get tensors in the correct format for the model
-    model_inputs = tokenizer(document.text, return_tensors="pt", **tokenizer_kwargs)
-    # this is added when using return_overflowing_tokens=True, but the model does not accept it
-    model_inputs.pop("overflow_to_sample_mapping", None)
-    assert len(model_inputs.encodings) == len(tokenized_documents)
-    model_output = model(**model_inputs)
-    # get embeddings for all text annotations
-    embeddings = {}
-    for batch_idx in range(len(model_output.last_hidden_state)):
-        text2tok_ann = added_annotations[batch_idx][text_layer_name]
-        tok2text_ann = {v: k for k, v in text2tok_ann.items()}
-        for tok_ann in tokenized_documents[batch_idx].labeled_spans:
-            # skip "empty" annotations
-            if tok_ann.start == tok_ann.end:
-                continue
-            # use the max pooling strategy to get a single embedding for the annotation text
-            embedding = model_output.last_hidden_state[batch_idx, tok_ann.start : tok_ann.end].max(
-                dim=0
-            )[0]
-            text_ann = tok2text_ann[tok_ann]
-            if text_ann in embeddings:
-                logger.warning(
-                    f"Overwriting embedding for annotation '{text_ann}' (do you use striding?)"
-                )
-            embeddings[text_ann] = embedding
-    return embeddings
-def _annotate(
-    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
-    pipeline: Pipeline,
-    embedding_model: Optional[PreTrainedModel] = None,
-    embedding_tokenizer: Optional[PreTrainedTokenizer] = None,
 ) -> None:
     # execute prediction pipeline
-    pipeline(document)
-    if embedding_model is not None and embedding_tokenizer is not None:
-        adu_embeddings = _embed_text_annotations(
             document=document,
-            model=embedding_model,
-            tokenizer=embedding_tokenizer,
-            text_layer_name="labeled_spans",
         )
         # convert keys to str because JSON keys must be strings
-        adu_embeddings_dict = {
-            labeled_span_to_id(k): v.detach().tolist() for k, v in adu_embeddings.items()
         }
-        document.metadata["embeddings"] = adu_embeddings_dict
     else:
         gr.Warning(
             "No embedding model provided. Skipping embedding extraction. You can load an embedding "
@@ -100,47 +47,47 @@ def _annotate(
         )
-def create_and_annotate_document(
-    text: str,
-    doc_id: str,
-    models: Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]],
 ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
     """Create a TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions from the provided
-    text, annotate it, and add it to the index.
     Parameters:
         text: The text to process.
         doc_id: The ID of the document.
-        models: A tuple containing the prediction pipeline and the embedding model and tokenizer.
     Returns:
         The processed document.
     """
-    try:
-        document = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions(
-            id=doc_id, text=text, metadata={}
-        )
-        # add single partition from the whole text (the model only considers text in partitions)
-        document.labeled_partitions.append(LabeledSpan(start=0, end=len(text), label="text"))
-        # annotate the document
-        _annotate(
-            document=document,
-            pipeline=models[0],
-            embedding_model=models[1],
-            embedding_tokenizer=models[2],
-        )
-        return document
-    except Exception as e:
-        raise gr.Error(f"Failed to process text: {e}")
-def load_argumentation_model(model_name: str, revision: Optional[str] = None) -> Pipeline:
     try:
         model = AutoPipeline.from_pretrained(
             model_name,
-            device=-1,
             num_workers=0,
             taskmodule_kwargs=dict(revision=revision),
             model_kwargs=dict(revision=revision),
@@ -151,23 +98,28 @@ def load_argumentation_model(model_name: str, revision: Optional[str] = None) ->
     return model
-def load_embedding_model(model_name: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
-    try:
-        embedding_model = AutoModel.from_pretrained(model_name)
-        embedding_tokenizer = AutoTokenizer.from_pretrained(model_name)
-    except Exception as e:
-        raise gr.Error(f"Failed to load embedding model: {e}")
-    gr.Info(f"Loaded embedding model: model_name={model_name})")
-    return embedding_model, embedding_tokenizer
 def load_models(
-    model_name: str, revision: Optional[str] = None, embedding_model_name: Optional[str] = None
-) -> Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]]:
-    argumentation_model = load_argumentation_model(model_name, revision)
     embedding_model = None
-    embedding_tokenizer = None
     if embedding_model_name is not None and embedding_model_name.strip():
-        embedding_model, embedding_tokenizer = load_embedding_model(embedding_model_name)
-    return argumentation_model, embedding_model, embedding_tokenizer

 import logging
+from typing import Optional, Tuple
 import gradio as gr
+import torch
 from annotation_utils import labeled_span_to_id
+from embedding import EmbeddingModel, HuggingfaceEmbeddingModel
 from pytorch_ie import Pipeline
 from pytorch_ie.annotations import LabeledSpan
 from pytorch_ie.auto import AutoPipeline
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 logger = logging.getLogger(__name__)
+def annotate_document(
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+    annotation_pipeline: Pipeline,
+    embedding_model: Optional[EmbeddingModel] = None,
 ) -> None:
+    """Annotate a document with the provided pipeline. If an embedding model is provided, also
+    extract embeddings for the labeled spans.
+    Args:
+        document: The document to annotate.
+        annotation_pipeline: The pipeline to use for annotation.
+        embedding_model: The embedding model to use for extracting text span embeddings.
+    """
     # execute prediction pipeline
+    annotation_pipeline(document)
+    if embedding_model is not None:
+        text_span_embeddings = embedding_model(
             document=document,
+            span_layer_name="labeled_spans",
         )
         # convert keys to str because JSON keys must be strings
+        text_span_embeddings_dict = {
+            labeled_span_to_id(k): v.tolist() for k, v in text_span_embeddings.items()
         }
+        document.metadata["embeddings"] = text_span_embeddings_dict
     else:
         gr.Warning(
             "No embedding model provided. Skipping embedding extraction. You can load an embedding "
         )
+def create_document(
+    text: str, doc_id: str
 ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
     """Create a TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions from the provided
+    text.
     Parameters:
         text: The text to process.
         doc_id: The ID of the document.
     Returns:
         The processed document.
     """
+    document = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions(
+        id=doc_id, text=text, metadata={}
+    )
+    # add single partition from the whole text (the model only considers text in partitions)
+    document.labeled_partitions.append(LabeledSpan(start=0, end=len(text), label="text"))
+    return document
+def load_argumentation_model(
+    model_name: str,
+    revision: Optional[str] = None,
+    device: str = "cpu",
+) -> Pipeline:
     try:
+        # the Pipeline class expects an integer for the device
+        if device == "cuda":
+            pipeline_device = 0
+        elif device.startswith("cuda:"):
+            pipeline_device = int(device.split(":")[1])
+        elif device == "cpu":
+            pipeline_device = -1
+        else:
+            raise gr.Error(f"Invalid device: {device}")
         model = AutoPipeline.from_pretrained(
             model_name,
+            device=pipeline_device,
             num_workers=0,
             taskmodule_kwargs=dict(revision=revision),
             model_kwargs=dict(revision=revision),
     return model
 def load_models(
+    model_name: str,
+    revision: Optional[str] = None,
+    embedding_model_name: Optional[str] = None,
+    # embedding_model_revision: Optional[str] = None,
+    embedding_max_length: int = 512,
+    embedding_batch_size: int = 16,
+    device: str = "cpu",
+) -> Tuple[Pipeline, Optional[EmbeddingModel]]:
+    torch.cuda.empty_cache()
+    argumentation_model = load_argumentation_model(model_name, revision=revision, device=device)
     embedding_model = None
     if embedding_model_name is not None and embedding_model_name.strip():
+        try:
+            embedding_model = HuggingfaceEmbeddingModel(
+                embedding_model_name.strip(),
+                # revision=embedding_model_revision,
+                device=device,
+                max_length=embedding_max_length,
+                batch_size=embedding_batch_size,
+            )
+        except Exception as e:
+            raise gr.Error(f"Failed to load embedding model: {e}")
+    return argumentation_model, embedding_model

rendering_utils.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import json
 from collections import defaultdict
 from typing import Dict, List, Optional, Union
@@ -7,6 +8,8 @@ from pytorch_ie.annotations import BinaryRelation
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils_displacy import EntityRenderer
 def render_pretty_table(
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions, **render_kwargs
@@ -92,7 +95,7 @@ def inject_relation_data(
         entity_annotation = sorted_entities[idx]
         # sanity check
         if str(entity_annotation) != entity.next:
-            raise ValueError(f"Entity text mismatch: {entity_annotation} != {entity.text}")
         entity["data-label"] = entity_annotation.label
         entity["data-relation-tails"] = json.dumps(
             [

 import json
+import logging
 from collections import defaultdict
 from typing import Dict, List, Optional, Union
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils_displacy import EntityRenderer
+logger = logging.getLogger(__name__)
 def render_pretty_table(
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions, **render_kwargs
         entity_annotation = sorted_entities[idx]
         # sanity check
         if str(entity_annotation) != entity.next:
+            logger.warning(f"Entity text mismatch: {entity_annotation} != {entity.text}")
         entity["data-label"] = entity_annotation.label
         entity["data-relation-tails"] = json.dumps(
             [

requirements.txt CHANGED Viewed

@@ -2,3 +2,6 @@ gradio==4.36.0
 prettytable==3.10.0
 pie-modules==0.12.0
 beautifulsoup4==4.12.3

 prettytable==3.10.0
 pie-modules==0.12.0
 beautifulsoup4==4.12.3
+datasets==2.14.4
+# numpy 2.0.0 breaks the code
+numpy==1.25.2