Upload 7 files
- README.md +13 -6
- app.py +255 -0
- cache_system.py +61 -0
- header.py +57 -0
- prompts.py +31 -0
- requirements.txt +6 -0
- utils.py +54 -0
README.md
CHANGED
@@ -1,13 +1,20 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: NotiCIA
+emoji: 📰
+colorFrom: indigo
+colorTo: pink
 sdk: gradio
-sdk_version: 4.24.0
-app_file: app.py
 pinned: false
 license: apache-2.0
+suggested_hardware: t4-small
+suggested_storage: small
+app_file: app.py
+fullWidth: true
+models:
+- somosnlp/NoticIA-7B
+tags:
+- summarization
+- clickbait
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,255 @@
import datetime

import gradio as gr
import torch
from cache_system import CacheHandler
from header import article, header
from newspaper import Article
from prompts import summarize_clickbait_short_prompt
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    LogitsProcessorList,
    TextStreamer,
)
from utils import StopAfterTokenIsGenerated

total_runs = 0

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("somosnlp/NoticIA-7B")

# Load the model in 4-bit to use less VRAM.
# We use bitsandbytes because it is the simplest option to set up for the demo, although it is neither the fastest nor the most efficient.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "somosnlp/NoticIA-7B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)

# Generation parameters.
generation_config = GenerationConfig(
    max_new_tokens=128,  # Summaries are short, we don't need more tokens
    min_new_tokens=1,  # We don't want empty summaries
    do_sample=True,  # Slightly better than greedy decoding
    num_beams=1,
    use_cache=True,  # Efficiency
    top_k=40,
    top_p=0.1,
    repetition_penalty=1.1,  # Helps prevent the model from getting stuck in loops
    encoder_repetition_penalty=1.1,  # Encourage the model to quote the original text
    temperature=0.15,  # Low temperature to keep the model from generating overly creative text
)

# Stop words, to prevent the model from generating tokens we don't want.
stop_words = [
    "<s>",
    "</s>",
    "\\n",
    "[/INST]",
    "[INST]",
    "### User:",
    "### Assistant:",
    "###",
    "<start_of_turn>",
    "<end_of_turn>",
    "<end_of_turn>\n",
    "<end_of_turn>\\n",
    "<eos>",
]

# Logits processor that stops generation when the model produces a stop word
stop_criteria = LogitsProcessorList(
    [
        StopAfterTokenIsGenerated(
            stops=[
                torch.tensor(tokenizer.encode(stop_word, add_special_tokens=False))
                for stop_word in stop_words.copy()
            ],
            eos_token_id=tokenizer.eos_token_id,
        )
    ]
)


def generate_text(url: str) -> (str, str):
    """
    Given the URL of a news article, generate a single-sentence summary that reveals the truth behind the headline.

    Args:
        url (str): URL of the article.

    Returns:
        str: Headline of the article.
        str: Summary of the article.
    """
    global cache_handler
    global total_runs

    total_runs += 1
    print(f"Total runs: {total_runs}. Last run: {datetime.datetime.now()}")

    url = url.strip()

    if url.startswith("https://twitter.com/") or url.startswith("https://x.com/"):
        yield (
            "🤖 Vaya, parece que has introducido la url de un tweet. No puedo acceder a tweets, tienes que introducir la URL de una noticia.",
            "❌❌❌ Si el tweet contiene una noticia, dame la URL de la noticia ❌❌❌",
        )
        return

    # 1) Download the article

    # progress(0, desc="🤖 Accediendo a la noticia")

    # First, check if the URL is in the cache
    headline, text, resumen = cache_handler.get_from_cache(url, 0)
    if headline is not None and text is not None and resumen is not None:
        yield headline, resumen
        return

    try:
        article = Article(url)
        article.download()
        article.parse()
        headline = article.title
        text = article.text
    except Exception as e:
        print(e)
        headline = None
        text = None

    if headline is None or text is None:
        yield (
            "🤖 No he podido acceder a la noticia, asegúrate de que la URL es correcta y de que es posible acceder a la noticia desde un navegador.",
            "❌❌❌ Inténtalo de nuevo ❌❌❌",
        )
        return

    # progress(0.5, desc="🤖 Leyendo noticia")

    try:
        prompt = summarize_clickbait_short_prompt(headline=headline, body=text)

        formatted_prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True,
        )

        model_inputs = tokenizer(
            [formatted_prompt], return_tensors="pt", add_special_tokens=False
        )

        streamer = TextStreamer(tokenizer=tokenizer, skip_prompt=True)

        model_output = model.generate(
            **model_inputs.to(model.device),
            streamer=streamer,
            generation_config=generation_config,
            logits_processor=stop_criteria,
        )

        yield headline, streamer

        resumen = tokenizer.batch_decode(
            model_output,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )[0].replace("<|end_of_turn|>", "")

        resumen = resumen.split("GPT4 Correct Assistant:")[-1]

    except Exception as e:
        print(e)
        yield (
            "🤖 Error en la generación.",
            "❌❌❌ Inténtalo de nuevo más tarde ❌❌❌",
        )
        return

    cache_handler.add_to_cache(
        url=url, title=headline, text=text, summary_type=0, summary=resumen
    )
    yield headline, resumen

    hits, misses, cache_len = cache_handler.get_cache_stats()
    print(
        f"Hits: {hits}, misses: {misses}, cache length: {cache_len}. Percent hits: {round(hits/(hits+misses)*100,2)}%."
    )


# We use a cache to store the most recently processed URLs.
# Users will very likely submit the same URL several times on the same day, because
# different people will want to see the summary of the same news article.
# The cache stores the generated summaries so we do not have to generate them again.
# The cache holds at most 1000 entries; when it is full, the oldest entry is evicted.
cache_handler = CacheHandler(max_cache_size=1000)


demo = gr.Interface(
    generate_text,
    inputs=[
        gr.Textbox(
            label="🌐 URL de la noticia",
            info="Introduce la URL de la noticia que deseas resumir.",
            value="https://somosnlp.org/",
            interactive=True,
        )
    ],
    outputs=[
        gr.Textbox(
            label="📰 Titular de la noticia",
            interactive=False,
            placeholder="Aquí aparecerá el título de la noticia",
        ),
        gr.Textbox(
            label="🗒️ Resumen",
            interactive=False,
            placeholder="Aquí aparecerá el resumen de la noticia.",
        ),
    ],
    # headline="⚔️ Clickbait Fighter! ⚔️",
    thumbnail="https://huggingface.co/datasets/Iker/NoticIA/resolve/main/assets/logo.png",
    theme="JohnSmith9982/small_and_pretty",
    description=header,
    article=article,
    cache_examples=False,
    concurrency_limit=1,
    examples=[
        "https://www.huffingtonpost.es/virales/le-compra-abrigo-abuela-97nos-reaccion-fantasia.html",
        "https://emisorasunidas.com/2023/12/29/que-pasara-el-15-de-enero-de-2024/",
        "https://www.huffingtonpost.es/virales/llega-espana-le-llama-atencion-nombres-propios-persona.html",
        "https://www.infobae.com/que-puedo-ver/2023/11/19/la-comedia-familiar-y-navidena-que-ya-esta-en-netflix-y-puedes-ver-en-estas-fiestas/",
        "https://www.cope.es/n/1610984",
    ],
    submit_btn="Generar resumen",
    stop_btn="Detener generación",
    clear_btn="Limpiar",
    allow_flagging=False,
)

demo.queue(max_size=None)
demo.launch(share=False)
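For orientation, the sketch below shows how the same model and prompt could be queried outside of the Gradio app. The model name and the prompt helper come from the files in this commit; the headline and body values are placeholders, and the generation arguments are only an assumption that mirrors the demo's settings rather than the demo's exact code.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from prompts import summarize_clickbait_short_prompt

tokenizer = AutoTokenizer.from_pretrained("somosnlp/NoticIA-7B")
model = AutoModelForCausalLM.from_pretrained(
    "somosnlp/NoticIA-7B", torch_dtype=torch.bfloat16, device_map="auto"
)

# Placeholder headline/body; in the demo these come from newspaper3k
prompt = summarize_clickbait_short_prompt(
    headline="Titular clickbait", body="Cuerpo de la noticia"
)
chat = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}], tokenize=False, add_generation_prompt=True
)
inputs = tokenizer([chat], return_tensors="pt", add_special_tokens=False).to(model.device)
output = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.15)
print(tokenizer.batch_decode(output, skip_special_tokens=True)[0])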
cache_system.py
ADDED
@@ -0,0 +1,61 @@
from collections import OrderedDict
from datetime import datetime
from typing import Optional


class CacheHandler:
    def __init__(self, max_cache_size: int = 1000):
        # Using OrderedDict to maintain the order of insertion for efficient removal of oldest items
        self.cache = OrderedDict()
        self.max_cache_size = max_cache_size
        self.misses = 0
        self.hits = 0

    def add_to_cache(
        self, url: str, title: str, text: str, summary_type: int, summary: str
    ):
        # If URL already exists, update it and move it to the end to mark it as the most recently used
        if url in self.cache:
            self.cache.move_to_end(url)
            self.cache[url][f"summary_{summary_type}"] = summary
            self.cache[url]["date"] = datetime.now()
        else:
            # Add new entry to the cache
            self.cache[url] = {
                "title": title,
                "text": text,
                "date": datetime.now(),
                "summary_0": summary if summary_type == 0 else None,
                "summary_50": summary if summary_type == 50 else None,
                "summary_100": summary if summary_type == 100 else None,
            }
            # Remove the oldest item if cache exceeds max size
            if len(self.cache) > self.max_cache_size:
                self.cache.move_to_end(
                    "https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/"
                )  # This is the default value in the demo, so we don't want to remove it
                self.cache.popitem(last=False)  # pop the oldest item

    def get_from_cache(
        self, url: str, summary_type: int, second_try: bool = False
    ) -> Optional[tuple]:
        if url in self.cache and self.cache[url][f"summary_{summary_type}"] is not None:
            # Move the accessed item to the end to mark it as recently used
            self.cache.move_to_end(url)
            self.hits += 1
            if second_try:
                # On the first try we didn't get a cache hit, probably because the URL was shortened,
                # so we decrease the number of misses because we got the hit in the end
                self.misses -= 1
            return (
                self.cache[url]["title"],
                self.cache[url]["text"],
                self.cache[url][f"summary_{summary_type}"],
            )
        else:
            if not second_try:
                self.misses += 1
            return None, None, None

    def get_cache_stats(self):
        return self.hits, self.misses, len(self.cache)
header.py
ADDED
@@ -0,0 +1,57 @@
header = """

<p align="center">
  <img src="https://huggingface.co/datasets/Iker/NoticIA/resolve/main/assets/logo.png" style="width: 50%;">
</p>

<p align="justify">
Los artículos 🖱️Clickbait buscan captar la atención de los lectores mediante la curiosidad, utilizando titulares que plantean preguntas o afirmaciones incompletas, sensacionalistas, exageradas o directamente engañosas. Estos titulares a menudo esconden la respuesta al clickbait hasta el final del artículo, obligando al lector a desplazarse a través de un sinfín de contenido irrelevante. El verdadero objetivo es atraer visitas a la página para exponer al usuario a una cantidad máxima de publicidad, sacrificando la calidad y el valor informativo en el proceso.
</p>

### ¿Por qué representa un problema?
<p align="justify">
La práctica del 🖱️Clickbait erosiona la confianza del público en las fuentes de noticias digitales y perjudica los ingresos publicitarios de los productores de contenido legítimo, que pueden experimentar una disminución en su tráfico web como resultado.
</p>

### ¿Qué acciones hemos tomado para abordar este desafío?

- 📰 Hemos desarrollado NoticIA, una colección que incluye 850 artículos de noticias en español caracterizados por titulares clickbait. Cada artículo está acompañado de un resumen generativo de alta calidad y concisión, redactado por expertos humanos. Explora [🤗NoticIA-it](https://huggingface.co/datasets/somosnlp/NoticIA-it).
- 📈 Evaluamos decenas de modelos de inteligencia artificial en este conjunto de datos. Los resultados se pueden consultar aquí: [NoticIA Benchmark](https://huggingface.co/somosnlp/Resumen_Noticias_Clickbait/resolve/main/Results_finetune.png).
- 🤖 Entrenamos un avanzado modelo de lenguaje con 7 mil millones de parámetros específicamente con nuestro dataset, [🤗NoticIA-7B](https://huggingface.co/somosnlp/NoticIA-7B).

<p align="justify">
NoticIA ofrece un escenario ideal para probar la habilidad de los modelos de lenguaje en la comprensión de textos en español. Esta tarea es compleja, ya que requiere discernir la pregunta oculta en un titular clickbait o identificar la información que realmente busca el usuario. Este reto implica filtrar grandes volúmenes de contenido superfluo para hallar y resumir de manera precisa y sucinta la información relevante.
</p>

## ¿Cómo funciona esta demo?
<p align="justify">
Solo introduce la URL de un artículo clickbait en el campo de texto y haz clic en el botón "Generar resumen" para probarla.
</p>

## Mirando hacia el futuro
- 📚 Planeamos expandir NoticIA con aún más artículos clickbait.
- 🔮 Introduciremos etiquetas adicionales al conjunto de datos, incluyendo métricas que cuantifiquen el grado de clickbait de los artículos.
- 📔 Estamos preparando un artículo para profundizar en los hallazgos y metodologías de nuestro proyecto.

""".strip()


article = """

Esta demo ha sido creada por [Iker García-Ferrero](https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/) y [Begoña Altuna](https://www.linkedin.com/in/bego%C3%B1a-altuna-78014139). Somos investigadores en PLN en la Universidad del País Vasco, dentro del grupo de investigación [IXA](https://www.ixa.eus/) y formamos parte de [HiTZ, el Centro Vasco de Tecnología de la Lengua](https://www.hitz.eus/es).


<div style="display: flex; justify-content: space-around; width: 100%;">
  <div style="width: 50%;" align="left">
    <a href="http://ixa.si.ehu.es/">
      <img src="https://raw.githubusercontent.com/ikergarcia1996/Iker-Garcia-Ferrero/master/icons/ixa.png" width="50" height="50" alt="Ixa NLP Group">
    </a>
  </div>
  <div style="width: 50%;" align="right">
    <a href="http://www.hitz.eus/">
      <img src="https://raw.githubusercontent.com/ikergarcia1996/Iker-Garcia-Ferrero/master/icons/Hitz.png" width="300" height="50" alt="HiTZ Basque Center for Language Technologies">
    </a>
  </div>
</div>
""".strip()
prompts.py
ADDED
@@ -0,0 +1,31 @@
def summarize_clickbait_short_prompt(
    headline: str,
    body: str,
) -> str:
    """
    Generate the prompt for the model.

    Args:
        headline (`str`):
            The headline of the article.
        body (`str`):
            The body of the article.
    Returns:
        `str`: The formatted prompt.
    """

    return (
        f"Ahora eres una Inteligencia Artificial experta en desmontar titulares sensacionalistas o clickbait. "
        f"Tu tarea consiste en analizar noticias con titulares sensacionalistas y "
        f"generar un resumen de una sola frase que revele la verdad detrás del titular.\n"
        f"Este es el titular de la noticia: {headline}\n"
        f"El titular plantea una pregunta o proporciona información incompleta. "
        f"Debes buscar en el cuerpo de la noticia una frase que responda lo que se sugiere en el título. "
        f"Siempre que puedas cita el texto original, especialmente si se trata de una frase que alguien ha dicho. "
        f"Si citas una frase que alguien ha dicho, usa comillas para indicar que es una cita. "
        f"Usa siempre las mínimas palabras posibles. No es necesario que la respuesta sea una oración completa. "
        f"Puede ser sólo el foco de la pregunta. "
        f"Recuerda responder siempre en Español.\n"
        f"Este es el cuerpo de la noticia:\n"
        f"{body}"
    )
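To make the shape of the prompt concrete, here is a tiny sketch with invented placeholder values; in app.py the resulting string is then wrapped in the model's chat template via tokenizer.apply_chat_template before generation.

from prompts import summarize_clickbait_short_prompt

prompt = summarize_clickbait_short_prompt(
    headline="No creerás lo que encontró en su jardín",
    body="Texto completo de la noticia...",
)
print(prompt)  # Spanish instruction + headline + body, ready for the chat template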
requirements.txt
ADDED
@@ -0,0 +1,6 @@
setuptools
gradio
transformers
numpy
bitsandbytes
newspaper3k
utils.py
ADDED
@@ -0,0 +1,54 @@
import logging
from typing import List

import torch
from transformers import (
    LogitsProcessor,
)


class StopAfterTokenIsGenerated(LogitsProcessor):
    def __init__(self, stops: List[torch.tensor], eos_token_id: int):
        super().__init__()

        self.stops = stops
        self.eos_token_id = eos_token_id
        logging.info(f"Stopping criteria words ids: {self.stops}")
        self.first_batch = True

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        """
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
            scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
                Prediction scores of a language modeling head. These can be logits for each vocabulary token when not
                using beam search, or log softmax for each vocabulary token when using beam search.

        Return:
            `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.

        """
        if self.first_batch:
            self.first_batch = False
            return scores

        for seq_no, seq in enumerate(input_ids):
            # logging.info(seq_no)
            for stop in self.stops:
                stop = stop.to(device=seq.device, dtype=seq.dtype)
                if (
                    len(seq) >= len(stop)
                    and torch.all((stop == seq[-len(stop) :])).item()
                ):
                    scores[seq_no, :] = -float("inf")
                    scores[seq_no, self.eos_token_id] = 0
                    # logging.info(f"Stopping criteria found: {stop}")
                    break

        return scores

    def reset(self):
        self.first_batch = True
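Finally, a small standalone sketch of the logits processor above: once a stop sequence appears at the end of a generated sequence, every logit is set to negative infinity except the EOS token. The token IDs below are arbitrary toy values, not real vocabulary entries from the NoticIA tokenizer.

import torch

from utils import StopAfterTokenIsGenerated

processor = StopAfterTokenIsGenerated(stops=[torch.tensor([7, 8])], eos_token_id=2)

# The first call is skipped on purpose (it only sees the prompt), so simulate a later generation step
processor.first_batch = False

input_ids = torch.tensor([[5, 6, 7, 8]])  # one sequence that ends with the stop pattern [7, 8]
scores = torch.zeros((1, 10))  # fake logits over a toy 10-token vocabulary
scores = processor(input_ids, scores)
print(scores)  # every position is -inf except index 2 (eos_token_id), which is 0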