Spaces:

diegomrodrigues
/

bert-topic-gradio

Running

App Files Files Community

diegomrodrigues commited on Jul 18, 2024

Commit

d734a6a

verified ·

1 Parent(s): 80c5ad2

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

.github/workflows/update_space.yml +28 -0
README.md +3 -8
app.py +226 -0

.github/workflows/update_space.yml ADDED Viewed

	@@ -0,0 +1,28 @@

+name: Run Python script
+on:
+  push:
+    branches:
+      - main
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.9'
+    - name: Install Gradio
+      run: python -m pip install gradio
+    - name: Log in to Hugging Face
+      run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+    - name: Deploy to Spaces
+      run: gradio deploy

README.md CHANGED Viewed

@@ -1,12 +1,7 @@
 ---
-title: Bert Topic Gradio
-emoji: 📚
-colorFrom: pink
-colorTo: blue
 sdk: gradio
 sdk_version: 4.38.1
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: bert-topic-gradio
+app_file: app.py
 sdk: gradio
 sdk_version: 4.38.1
 ---
+# bert-topic-gradio

app.py ADDED Viewed

	@@ -0,0 +1,226 @@

+import gradio as gr
+import json
+from langchain_community.document_loaders import ArxivLoader
+from langchain_community.document_loaders.merge import MergedDataLoader
+from langchain_core.documents import Document
+from typing import Iterator, List, Dict
+from bertopic import BERTopic
+from bertopic.representation import KeyBERTInspired
+from umap import UMAP
+import numpy as np
+from collections import defaultdict
+class CustomArxivLoader(ArxivLoader):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+    def lazy_load(self) -> Iterator[Document]:
+        documents = super().lazy_load()
+        def update_metadata(documents):
+            for document in documents:
+                yield Document(
+                    page_content=document.page_content,
+                    metadata={
+                        **document.metadata,
+                        "ArxivId": self.query,
+                        "Source": f"https://arxiv.org/pdf/{self.query}.pdf"
+                    }
+                )
+        return update_metadata(documents)
+def upload_file(file):
+    if not ".json" in file.name:
+        return "Not Allowed"
+    print(f"Processing file: {file.name}")
+    with open(file.name, "r") as f:
+        results = json.load(f)
+    arxiv_urls = results["collected_urls"]["arxiv.org"]
+    print(f"Collected {len(arxiv_urls)} arxiv urls from file.")
+    arxiv_ids = map(lambda url: url.split("/")[-1].strip(".pdf"), arxiv_urls)
+    all_loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]
+    merged_loader = MergedDataLoader(loaders=all_loaders)
+    documents = merged_loader.load()
+    print(f"Loaded {len(documents)} documents from file.")
+    return documents
+def process_documents(documents, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
+    if not documents:
+        return "No documents to process. Please upload a file first."
+    contents = [doc.page_content for doc in documents]
+    representation_model = KeyBERTInspired()
+    umap_model = UMAP(
+        n_neighbors=umap_n_neighbors,
+        n_components=umap_n_components,
+        min_dist=umap_min_dist,
+        metric='cosine'
+    )
+    topic_model = BERTopic(
+        language="english",
+        verbose=True,
+        umap_model=umap_model,
+        min_topic_size=min_topic_size,
+        representation_model=representation_model,
+        nr_topics=nr_topics
+    )
+    topics, _ = topic_model.fit_transform(contents)
+    topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=' ')
+    print(f"Generated {len(topic_labels)} topics from data.")
+    print("Topic Labels: ", topic_labels)
+    return documents, topics.tolist() if isinstance(topics, np.ndarray) else topics, topic_labels
+def create_docs_matrix(documents: List[Document], topics: List[int], labels: List[str]) -> List[List[str]]:
+    if not documents:
+        return []
+    results = []
+    for i, (doc, topic) in enumerate(zip(documents, topics)):
+        label = labels[topic]
+        results.append([str(i), label, doc.metadata['Title']])
+    return results
+def get_unique_topics(labels: List[str]) -> List[str]:
+    return list(set(labels))
+def remove_topics(documents: List[Document], topics: List[int], labels: List[str], topics_to_remove: List[str]) -> tuple:
+    new_documents = []
+    new_topics = []
+    new_labels = []
+    for doc, topic, label in zip(documents, topics, labels):
+        if label not in topics_to_remove:
+            new_documents.append(doc)
+            new_topics.append(topic)
+            new_labels.append(label)
+    return new_documents, new_topics, new_labels
+def create_markdown_content(documents: List[Document], labels: List[str]) -> str:
+    if not documents or not labels:
+        return "No data available for download."
+    topic_documents = defaultdict(list)
+    for doc, label in zip(documents, labels):
+        topic_documents[label].append(doc)
+    full_text = "# Arxiv Articles by Topic\n\n"
+    for topic, docs in topic_documents.items():
+        full_text += f"## {topic}\n\n"
+        for document in docs:
+            full_text += f"### {document.metadata['Title']}\n\n"
+            full_text += f"{document.metadata['Summary']}\n\n"
+    return full_text
+with gr.Blocks(theme="default") as demo:
+    gr.Markdown("# Bert Topic Article Organizer App")
+    gr.Markdown("Organizes arxiv articles in different topics and exports it in a zip file.")
+    state = gr.State(value=[])
+    with gr.Row():
+        file_uploader = gr.UploadButton(
+            "Click to upload",
+            file_types=["json"],
+            file_count="single"
+        )
+        reprocess_button = gr.Button("Reprocess Documents")
+        download_button = gr.Button("Download Results")
+    with gr.Row():
+        with gr.Column():
+            umap_n_neighbors = gr.Slider(minimum=2, maximum=100, value=15, step=1, label="UMAP n_neighbors")
+            umap_n_components = gr.Slider(minimum=2, maximum=100, value=5, step=1, label="UMAP n_components")
+            umap_min_dist = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="UMAP min_dist")
+        with gr.Column():
+            min_topic_size = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic min_topic_size")
+            nr_topics = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic nr_topics")
+    with gr.Row():
+        output_matrix = gr.DataFrame(
+            label="Processing Result",
+            headers=["ID", "Topic", "Title"],
+            col_count=(3, "fixed"),
+            interactive=False
+        )
+    with gr.Row():
+        topic_dropdown = gr.Dropdown(
+            label="Select Topics to Remove",
+            multiselect=True,
+            interactive=True
+        )
+        remove_topics_button = gr.Button("Remove Selected Topics")
+    markdown_output = gr.File(label="Download Markdown", visible=False)
+    def update_ui(documents, topics, labels):
+        matrix = create_docs_matrix(documents, topics, labels)
+        unique_topics = get_unique_topics(labels)
+        return matrix, unique_topics
+    def process_and_update(state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
+        documents = state if state else []
+        new_documents, new_topics, new_labels = process_documents(documents, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics)
+        matrix, unique_topics = update_ui(new_documents, new_topics, new_labels)
+        return [new_documents, new_topics, new_labels], matrix, unique_topics
+    file_uploader.upload(
+        fn=lambda file: upload_file(file),
+        inputs=[file_uploader],
+        outputs=[state]
+    ).then(
+        fn=process_and_update,
+        inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
+        outputs=[state, output_matrix, topic_dropdown]
+    )
+    reprocess_button.click(
+        fn=process_and_update,
+        inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
+        outputs=[state, output_matrix, topic_dropdown]
+    )
+    def remove_and_update(state, topics_to_remove, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
+        documents, topics, labels = state
+        new_documents, new_topics, new_labels = remove_topics(documents, topics, labels, topics_to_remove)
+        return process_and_update([new_documents, new_topics, new_labels], umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics)
+    remove_topics_button.click(
+        fn=remove_and_update,
+        inputs=[state, topic_dropdown, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
+        outputs=[state, output_matrix, topic_dropdown]
+    )
+    def create_download_file(state):
+        documents, _, labels = state
+        content = create_markdown_content(documents, labels)
+        return gr.File(value=content, visible=True, filename="arxiv_articles_by_topic.md")
+    download_button.click(
+        fn=create_download_file,
+        inputs=[state],
+        outputs=[markdown_output]
+    )
+demo.launch(share=True, show_error=True, max_threads=10, debug=True)