# Spaces: Sleeping
import hashlib
import logging
import threading
from datetime import datetime, timedelta

import feedparser
import gradio as gr
import pandas as pd
import pytz
from bs4 import BeautifulSoup
from transformers import pipeline
# Global settings

# Summarization models offered in the UI: display label -> model identifier
# handed to the summarization pipeline.
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
    "Free Model (distilbart-cnn-6-6)": "sshleifer/distilbart-cnn-6-6",
}

# Maximum number of entries kept by the shared summary cache.
CACHE_SIZE = 500

# Recency window: fetch_rss_news only keeps entries published within this
# interval before "now".
RSS_FETCH_INTERVAL = timedelta(hours=8)

# Maximum number of articles returned by fetch_rss_news.
ARTICLE_LIMIT = 5

# RSS feeds grouped by category: category -> {source name -> feed URL}.
NEWS_SOURCES = {
    "Movilizaciones Sindicales": {
        "Pagina12": "https://www.pagina12.com.ar/rss/edicion-impresa",
    },
}
class NewsCache:
    """Lock-guarded, size-bounded in-memory cache with FIFO eviction.

    Entries live in a plain ``dict``. Dicts preserve insertion order, so the
    first key produced by iteration is the oldest one; that is the entry
    dropped when room is needed for a new key.
    """

    def __init__(self, size):
        """Create an empty cache that holds at most ``size`` entries."""
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        """Return the value stored under ``key``, or ``None`` if absent."""
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        """Store ``value`` under ``key``, evicting the oldest entry if full.

        Overwriting a key that is already cached replaces its value in place
        and never evicts another entry. A non-positive ``size`` disables
        caching: the call is a no-op instead of failing on an empty dict.
        """
        with self.lock:
            if self.size <= 0:
                return
            # Only a genuinely new key can grow the dict, so only then is
            # an eviction needed to stay within the bound.
            if key not in self.cache and len(self.cache) >= self.size:
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value
# Module-wide cache instance used by summarize_text to store summaries.
cache = NewsCache(size=CACHE_SIZE)
def fetch_rss_news(categories):
    """Collect recent articles from the RSS feeds of the given categories.

    Every feed registered in ``NEWS_SOURCES`` for each requested category is
    parsed, and entries published within ``RSS_FETCH_INTERVAL`` of the
    current UTC time are kept. Unknown categories contribute nothing.

    A feed that cannot be parsed is skipped with a warning. An entry that
    lacks a usable publication date, title, description or link is skipped
    on its own, so one malformed entry no longer discards the remaining
    entries of the same feed.

    Args:
        categories: Iterable of category names (keys of ``NEWS_SOURCES``).

    Returns:
        A list of at most ``ARTICLE_LIMIT`` dicts with the keys ``title``,
        ``description`` (HTML stripped to text), ``link``, ``category``,
        ``source`` and ``published`` (timezone-aware UTC ``datetime``),
        ordered newest first.
    """
    log = logging.getLogger(__name__)
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL

    for category in categories:
        for source, url in NEWS_SOURCES.get(category, {}).items():
            try:
                feed = feedparser.parse(url)
            except Exception:
                # Best effort: a broken source must not block the others.
                log.warning("Could not read feed %s (%s)", source, url, exc_info=True)
                continue

            for entry in feed.entries:
                try:
                    published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                    if published <= cutoff_time:
                        continue
                    articles.append({
                        "title": entry.title,
                        "description": BeautifulSoup(entry.description, "html.parser").get_text(),
                        "link": entry.link,
                        "category": category,
                        "source": source,
                        "published": published,
                    })
                except (AttributeError, KeyError, TypeError, ValueError):
                    # Missing field, or a missing/invalid date tuple.
                    log.debug("Skipping malformed entry from %s", source, exc_info=True)

    articles.sort(key=lambda article: article["published"], reverse=True)
    return articles[:ARTICLE_LIMIT]
# Summarization pipelines that have already been loaded, keyed by model name.
# Building a pipeline loads the model, so each one is created once and then
# reused; note that every model used stays resident in memory.
_SUMMARIZERS = {}
_SUMMARIZERS_LOCK = threading.Lock()


def _get_summarizer(model_name):
    """Return the shared summarization pipeline for ``model_name``, loading it on first use."""
    with _SUMMARIZERS_LOCK:
        summarizer = _SUMMARIZERS.get(model_name)
        if summarizer is None:
            summarizer = pipeline("summarization", model=model_name, device=-1)
            _SUMMARIZERS[model_name] = summarizer
        return summarizer


def summarize_text(text, model_name):
    """Summarize ``text`` with the given model, reusing cached results.

    The cache is consulted before any model is touched, so a cache hit costs
    no model loading at all. The cache key covers both the model name and
    the text, so a summary produced by one model is never returned for
    another.

    Args:
        text: The text to summarize.
        model_name: Model identifier passed to the summarization pipeline.

    Returns:
        The summary string, or ``"Summary unavailable."`` when ``text`` is
        empty/blank/``None`` or when inference fails.

    Raises:
        Whatever ``pipeline(...)`` raises if the model cannot be loaded;
        only inference errors are converted into the fallback string.
    """
    if not text or not text.strip():
        return "Summary unavailable."

    # md5 is used purely as a compact cache key, not for security.
    content_hash = hashlib.md5(f"{model_name}\n{text}".encode()).hexdigest()
    cached_summary = cache.get(content_hash)
    if cached_summary is not None:
        return cached_summary

    summarizer = _get_summarizer(model_name)
    try:
        result = summarizer(text, max_length=120, min_length=40, truncation=True)
        summary = result[0]['summary_text']
    except Exception:
        logging.getLogger(__name__).exception("Summarization failed with model %s", model_name)
        return "Summary unavailable."

    cache.set(content_hash, summary)
    return summary
def summarize_articles(articles, model_name):
    """Render every article as a text card that includes its summary.

    Args:
        articles: Iterable of article dicts providing ``title``,
            ``description``, ``category``, ``source`` and ``link``.
        model_name: Model identifier forwarded to ``summarize_text``.

    Returns:
        All cards joined by a newline; an empty string for no articles.
    """

    def render_card(item):
        # Summarize first, then fill in the card fields.
        digest = summarize_text(item["description"], model_name)
        return (
            "\n"
            f"📰 {item['title']}\n"
            f"- 📁 Category: {item['category']}\n"
            f"- 💡 Source: {item['source']}\n"
            f"- 🔗 Read More: {item['link']}\n"
            f"📃 Summary: {digest}\n"
        )

    return "\n".join(render_card(item) for item in articles)
def generate_summary(selected_categories, model_name):
    """Fetch recent news for the chosen categories and return their summaries.

    Args:
        selected_categories: Category names picked by the user.
        model_name: Model identifier used for summarization.

    Returns:
        The formatted summaries, or a short notice when no category was
        selected or no recent article was found.
    """
    if selected_categories:
        recent_articles = fetch_rss_news(selected_categories)
        if recent_articles:
            return summarize_articles(recent_articles, model_name)
        return "No recent news found in the selected categories."
    return "Please select at least one category."
def _is_union_mobilization(title):
    """Return True when ``title`` mentions a mobilization or union activity."""
    lowered = title.lower()
    # The stem "movilizaci" matches "movilización", the unaccented
    # "movilizacion" and the plural "movilizaciones" (which has no accent).
    return "movilizaci" in lowered or "sindical" in lowered


def fetch_union_mobilizations():
    """Collect last-day articles about union mobilizations.

    Reads every feed registered under the ``"Movilizaciones Sindicales"``
    category of ``NEWS_SOURCES`` and keeps entries published within the last
    24 hours whose title mentions a mobilization or union activity.

    A feed that cannot be parsed is skipped with a warning. An entry with a
    missing field or an unusable publication date is skipped on its own, so
    it no longer discards the remaining entries of the same feed.

    Returns:
        A list of dicts with the keys ``title``, ``description`` (HTML
        stripped to text), ``link``, ``source`` and ``published``
        (timezone-aware UTC ``datetime``), in feed order.
    """
    log = logging.getLogger(__name__)
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - timedelta(days=1)

    for source, url in NEWS_SOURCES.get("Movilizaciones Sindicales", {}).items():
        try:
            feed = feedparser.parse(url)
        except Exception:
            # Best effort: a broken source must not block the others.
            log.warning("Could not read feed %s (%s)", source, url, exc_info=True)
            continue

        for entry in feed.entries:
            try:
                published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                if published <= cutoff_time:
                    continue
                # Keep only union-mobilization stories.
                if not _is_union_mobilization(entry.title):
                    continue
                articles.append({
                    "title": entry.title,
                    "description": BeautifulSoup(entry.description, "html.parser").get_text(),
                    "link": entry.link,
                    "source": source,
                    "published": published,
                })
            except (AttributeError, KeyError, TypeError, ValueError):
                # Missing field, or a missing/invalid date tuple.
                log.debug("Skipping malformed entry from %s", source, exc_info=True)

    return articles
def create_mobilization_table():
    """Return recent union-mobilization articles rendered as a text table.

    Returns:
        The articles formatted by pandas as a plain-text table without the
        index column, or a notice string when there are no articles.
    """
    rows = fetch_union_mobilizations()
    if rows:
        # Let pandas lay the article dicts out as aligned columns.
        table = pd.DataFrame(rows)
        return table.to_string(index=False)
    return "No se encontraron movilizaciones sindicales recientes."
# Gradio Interface


def get_summary(selected_categories, selected_model):
    """Translate the selected model label to its model id and build the summary."""
    model_name = SUMMARIZER_MODELS[selected_model]
    return generate_summary(selected_categories, model_name)


with gr.Blocks() as demo:
    gr.Markdown("# 📰 AI News Summarizer")

    # Category and model pickers.
    with gr.Row():
        categories = gr.CheckboxGroup(
            label="Select News Categories",
            choices=list(NEWS_SOURCES),
        )
        model_selector = gr.Radio(
            label="Choose Summarization Model",
            choices=list(SUMMARIZER_MODELS),
            value="Default (facebook/bart-large-cnn)",
        )

    summarize_button = gr.Button("Get News Summary")
    summary_output = gr.Textbox(label="News Summary", lines=20)

    # Clicking the button feeds both selections to get_summary and shows
    # its return value in the output box.
    summarize_button.click(
        fn=get_summary,
        inputs=[categories, model_selector],
        outputs=summary_output,
    )


if __name__ == "__main__":
    demo.launch()