diegomrodrigues committed
Commit d734a6a
1 Parent(s): 80c5ad2

Upload folder using huggingface_hub

Files changed (3):
  1. .github/workflows/update_space.yml +28 -0
  2. README.md +3 -8
  3. app.py +226 -0
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
+name: Run Python script
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+
+      - name: Install Gradio
+        run: python -m pip install gradio
+
+      - name: Log in to Hugging Face
+        run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+      - name: Deploy to Spaces
+        run: gradio deploy
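
Note: the deploy job authenticates with the `hf_token` repository secret. A minimal local sketch for sanity-checking a token before saving it as that secret, assuming the token is exported in an HF_TOKEN environment variable (that variable name is only an example, not part of the workflow above):

# check_token.py -- verify an access token before storing it as hf_token
import os
import huggingface_hub

# HF_TOKEN is an assumed environment variable name
huggingface_hub.login(token=os.environ["HF_TOKEN"])
print(huggingface_hub.whoami()["name"])  # prints the account the token belongs to
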
README.md CHANGED
@@ -1,12 +1,7 @@
 ---
-title: Bert Topic Gradio
-emoji: 📚
-colorFrom: pink
-colorTo: blue
+title: bert-topic-gradio
+app_file: app.py
 sdk: gradio
 sdk_version: 4.38.1
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# bert-topic-gradio
app.py ADDED
@@ -0,0 +1,226 @@
+import gradio as gr
+import json
+from langchain_community.document_loaders import ArxivLoader
+from langchain_community.document_loaders.merge import MergedDataLoader
+from langchain_core.documents import Document
+from typing import Iterator, List
+from bertopic import BERTopic
+from bertopic.representation import KeyBERTInspired
+from umap import UMAP
+import numpy as np
+from collections import defaultdict
+
+class CustomArxivLoader(ArxivLoader):
+    def lazy_load(self) -> Iterator[Document]:
+        documents = super().lazy_load()
+
+        def update_metadata(documents):
+            # tag each document with its arXiv id and a direct PDF link
+            for document in documents:
+                yield Document(
+                    page_content=document.page_content,
+                    metadata={
+                        **document.metadata,
+                        "ArxivId": self.query,
+                        "Source": f"https://arxiv.org/pdf/{self.query}.pdf"
+                    }
+                )
+
+        return update_metadata(documents)
+
+def upload_file(file):
+    if not file.name.endswith(".json"):
+        return []
+
+    print(f"Processing file: {file.name}")
+
+    with open(file.name, "r") as f:
+        results = json.load(f)
+
+    arxiv_urls = results["collected_urls"]["arxiv.org"]
+
+    print(f"Collected {len(arxiv_urls)} arxiv urls from file.")
+
+    # strip(".pdf") would strip any of the characters '.', 'p', 'd', 'f'
+    # from both ends of the id, so remove only the extension instead
+    arxiv_ids = [url.split("/")[-1].removesuffix(".pdf") for url in arxiv_urls]
+
+    all_loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]
+
+    merged_loader = MergedDataLoader(loaders=all_loaders)
+
+    documents = merged_loader.load()
+
+    print(f"Loaded {len(documents)} documents from file.")
+
+    return documents
+
+def process_documents(documents, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
+    if not documents:
+        raise gr.Error("No documents to process. Please upload a file first.")
+
+    contents = [doc.page_content for doc in documents]
+
+    representation_model = KeyBERTInspired()
+
+    umap_model = UMAP(
+        n_neighbors=umap_n_neighbors,
+        n_components=umap_n_components,
+        min_dist=umap_min_dist,
+        metric='cosine'
+    )
+
+    topic_model = BERTopic(
+        language="english",
+        verbose=True,
+        umap_model=umap_model,
+        min_topic_size=min_topic_size,
+        representation_model=representation_model,
+        nr_topics=nr_topics
+    )
+
+    topics, _ = topic_model.fit_transform(contents)
+    topics = topics.tolist() if isinstance(topics, np.ndarray) else topics
+
+    topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=' ')
+
+    print(f"Generated {len(topic_labels)} topics from data.")
+    print("Topic Labels: ", topic_labels)
+
+    # generate_topic_labels returns one label per topic (outlier topic -1
+    # first when present), so expand it to one label per document for the
+    # per-document consumers below
+    label_by_topic = dict(zip(sorted(set(topics)), topic_labels))
+    labels = [label_by_topic[topic] for topic in topics]
+
+    return documents, topics, labels
+
+def create_docs_matrix(documents: List[Document], topics: List[int], labels: List[str]) -> List[List[str]]:
+    if not documents:
+        return []
+    results = []
+    # labels is per-document, so it lines up with documents directly
+    for i, (doc, label) in enumerate(zip(documents, labels)):
+        results.append([str(i), label, doc.metadata['Title']])
+    return results
+
+def get_unique_topics(labels: List[str]) -> List[str]:
+    return list(set(labels))
+
+def remove_topics(documents: List[Document], topics: List[int], labels: List[str], topics_to_remove: List[str]) -> tuple:
+    new_documents = []
+    new_topics = []
+    new_labels = []
+
+    for doc, topic, label in zip(documents, topics, labels):
+        if label not in topics_to_remove:
+            new_documents.append(doc)
+            new_topics.append(topic)
+            new_labels.append(label)
+
+    return new_documents, new_topics, new_labels
+
+def create_markdown_content(documents: List[Document], labels: List[str]) -> str:
+    if not documents or not labels:
+        return "No data available for download."
+
+    topic_documents = defaultdict(list)
+    for doc, label in zip(documents, labels):
+        topic_documents[label].append(doc)
+
+    full_text = "# Arxiv Articles by Topic\n\n"
+
+    for topic, docs in topic_documents.items():
+        full_text += f"## {topic}\n\n"
+
+        for document in docs:
+            full_text += f"### {document.metadata['Title']}\n\n"
+            full_text += f"{document.metadata['Summary']}\n\n"
+
+    return full_text
+
+with gr.Blocks(theme="default") as demo:
+    gr.Markdown("# Bert Topic Article Organizer App")
+    gr.Markdown("Organizes arxiv articles into topics and exports the result as a markdown file.")
+
+    state = gr.State(value=[])
+
+    with gr.Row():
+        file_uploader = gr.UploadButton(
+            "Click to upload",
+            file_types=[".json"],
+            file_count="single"
+        )
+        reprocess_button = gr.Button("Reprocess Documents")
+        download_button = gr.Button("Download Results")
+
+    with gr.Row():
+        with gr.Column():
+            umap_n_neighbors = gr.Slider(minimum=2, maximum=100, value=15, step=1, label="UMAP n_neighbors")
+            umap_n_components = gr.Slider(minimum=2, maximum=100, value=5, step=1, label="UMAP n_components")
+            umap_min_dist = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="UMAP min_dist")
+        with gr.Column():
+            # min_topic_size maps to HDBSCAN's min_cluster_size, which must be at least 2
+            min_topic_size = gr.Slider(minimum=2, maximum=100, value=10, step=1, label="BERTopic min_topic_size")
+            nr_topics = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic nr_topics")
+
+    with gr.Row():
+        output_matrix = gr.DataFrame(
+            label="Processing Result",
+            headers=["ID", "Topic", "Title"],
+            col_count=(3, "fixed"),
+            interactive=False
+        )
+
+    with gr.Row():
+        topic_dropdown = gr.Dropdown(
+            label="Select Topics to Remove",
+            multiselect=True,
+            interactive=True
+        )
+        remove_topics_button = gr.Button("Remove Selected Topics")
+
+    markdown_output = gr.File(label="Download Markdown", visible=False)
+
+    def update_ui(documents, topics, labels):
+        matrix = create_docs_matrix(documents, topics, labels)
+        unique_topics = get_unique_topics(labels)
+        # return an updated component so the dropdown's choices, not its
+        # selected value, are replaced with the current topic labels
+        return matrix, gr.Dropdown(choices=unique_topics, value=[])
+
+    def process_and_update(state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
+        # state is a plain list of documents right after upload, and a
+        # [documents, topics, labels] triple after any processing run
+        if state and not isinstance(state[0], Document):
+            documents = state[0]
+        else:
+            documents = state or []
+        new_documents, new_topics, new_labels = process_documents(documents, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics)
+        matrix, dropdown = update_ui(new_documents, new_topics, new_labels)
+        return [new_documents, new_topics, new_labels], matrix, dropdown
+
+    file_uploader.upload(
+        fn=upload_file,
+        inputs=[file_uploader],
+        outputs=[state]
+    ).then(
+        fn=process_and_update,
+        inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
+        outputs=[state, output_matrix, topic_dropdown]
+    )
+
+    reprocess_button.click(
+        fn=process_and_update,
+        inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
+        outputs=[state, output_matrix, topic_dropdown]
+    )
+
+    def remove_and_update(state, topics_to_remove, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
+        documents, topics, labels = state
+        new_documents, new_topics, new_labels = remove_topics(documents, topics, labels, topics_to_remove)
+        return process_and_update([new_documents, new_topics, new_labels], umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics)
+
+    remove_topics_button.click(
+        fn=remove_and_update,
+        inputs=[state, topic_dropdown, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
+        outputs=[state, output_matrix, topic_dropdown]
+    )
+
+    def create_download_file(state):
+        documents, _, labels = state
+        content = create_markdown_content(documents, labels)
+        # gr.File expects a file path rather than raw text, so write the
+        # markdown to disk and return the path
+        output_path = "arxiv_articles_by_topic.md"
+        with open(output_path, "w") as f:
+            f.write(content)
+        return gr.File(value=output_path, visible=True)
+
+    download_button.click(
+        fn=create_download_file,
+        inputs=[state],
+        outputs=[markdown_output]
+    )
+
+demo.launch(share=True, show_error=True, max_threads=10, debug=True)
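
Note: upload_file() expects the uploaded JSON to hold arXiv PDF urls under results["collected_urls"]["arxiv.org"]. A minimal sketch of a compatible input file, with placeholder arXiv ids:

# make_sample_input.py -- writes an input file in the structure upload_file() reads
import json

sample = {
    "collected_urls": {
        "arxiv.org": [  # the two ids below are placeholder examples
            "https://arxiv.org/pdf/1810.04805.pdf",
            "https://arxiv.org/pdf/2203.05794.pdf",
        ]
    }
}

with open("sample_input.json", "w") as f:
    json.dump(sample, f, indent=2)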