Update pages/4 Sunburst.py

pages/4 Sunburst.py  CHANGED  (+83, -364)
Old version (removed lines are marked "-"; unmarked lines are unchanged context):

@@ -1,19 +1,8 @@
 import streamlit as st
 import pandas as pd
-
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-import nltk
-import spacy
-from burst_detection import burst_detection, enumerate_bursts, burst_weights
-import matplotlib.pyplot as plt
-import os
-import io
-import math
 import numpy as np
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-import plotly.io as pio
 import sys

 #===config===
@@ -42,384 +31,114 @@ with st.popover("🔗 Menu"):
     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
-
-st.header("
 st.subheader('Put your file here...', anchor=False)

 #===clear cache===
 def reset_all():
     st.cache_data.clear()

-
-nlp = spacy.load("en_core_web_md")
-
 @st.cache_data(ttl=3600)
-def upload(extype):
-    df = pd.read_csv(uploaded_file)
-    #lens.org
-    if 'Publication Year' in df.columns:
-        df.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
-                           'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
-    return df
-
-@st.cache_data(ttl=3600)
-def get_ext(uploaded_file):
     extype = uploaded_file.name
     return extype

 @st.cache_data(ttl=3600)
-def get_minmax(df):
-    MIN = int(df['Year'].min())
-    MAX = int(df['Year'].max())
-    GAP = MAX - MIN
-    return MIN, MAX, GAP

 @st.cache_data(ttl=3600)
 def conv_txt(extype):
     col_dict = {'TI': 'Title',
                 'SO': 'Source title',
                 'DT': 'Document Type',
                 'AB': 'Abstract',
-                '
-
-
-
-
-# Helper Functions
-@st.cache_data(ttl=3600)
-def get_column_name(df, possible_names):
-    """Find and return existing column names from a list of possible names."""
-    for name in possible_names:
-        if name in df.columns:
-            return name
-    raise ValueError(f"None of the possible names {possible_names} found in DataFrame columns.")
-
-@st.cache_data(ttl=3600)
-def preprocess_text(text):
-    """Lemmatize and remove stopwords from text."""
-    return ' '.join([token.lemma_.lower() for token in nlp(text) if token.is_alpha and not token.is_stop])
-
-@st.cache_data(ttl=3600)
-def load_data(uploaded_file):
-    """Load data from the uploaded file."""
-    extype = get_ext(uploaded_file)
-    if extype.endswith('.csv'):
-        df = upload(extype)
-    elif extype.endswith('.txt'):
-        df = conv_txt(extype)
-
-    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
-    df = df.dropna(subset=['Year'])
-    df['Year'] = df['Year'].astype(int)
-
-    if 'Title' in df.columns and 'Abstract' in df.columns:
-        coldf = ['Abstract', 'Title']
-    elif 'Title' in df.columns:
-        coldf = ['Title']
-    elif 'Abstract' in df.columns:
-        coldf = ['Abstract']
-    else:
-        coldf = sorted(df.select_dtypes(include=['object']).columns.tolist())
-
-    MIN, MAX, GAP = get_minmax(df)

-
-
-@st.cache_data(ttl=3600)
-def clean_data(df):
-
-    years = list(range(YEAR[0],YEAR[1]+1))
-    df = df.loc[df['Year'].isin(years)]
-
-    # Preprocess text
-    df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
-
-    # Vectorize processed text
-    vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
-    X = vectorizer.fit_transform(df['processed'].tolist())
-
-    # Create DataFrame from the Document-Term Matrix (DTM)
-    dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=df['Year'].values)
-    yearly_term_frequency = dtm.groupby(dtm.index).sum()
-
-    # User inputs for top words analysis and exclusions
-    excluded_words = [word.strip() for word in excluded_words_input.split(',')]
-
-    # Identify top words, excluding specified words
-    filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
-    top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
-
-    return yearly_term_frequency, top_words
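
Note on the removed clean_data(): it builds a document-term matrix with scikit-learn's CountVectorizer (not imported in the hunks shown, so presumably in the unchanged config section) and collapses it into per-year term counts. A minimal standalone sketch of that counting step, with made-up documents:

    # Sketch of clean_data()'s counting step on toy data (assumes scikit-learn).
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer

    docs = pd.DataFrame({'Year': [2020, 2020, 2021],
                         'processed': ['topic model', 'topic detection', 'burst detection']})

    # Text is already lemmatized, so tokenize by whitespace only.
    vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
    X = vectorizer.fit_transform(docs['processed'].tolist())

    # Document-term matrix indexed by year, summed into yearly frequencies.
    dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(),
                       index=docs['Year'].values)
    yearly_term_frequency = dtm.groupby(dtm.index).sum()
    print(yearly_term_frequency)
    #       burst  detection  model  topic
    # 2020      0          1      1      2
    # 2021      1          1      0      0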
-
-@st.cache_data(ttl=3600)
-def apply_burst_detection(top_words, data):
-    all_bursts_list = []
-
-    start_year = int(data.index.min())
-    end_year = int(data.index.max())
-    all_years = range(start_year, end_year + 1)
-
-    continuous_years = pd.Series(index=all_years, data=0)  # Start with a series of zeros for all years
-
-    years = continuous_years.index.tolist()
-
-    all_freq_data = pd.DataFrame(index=years)
-
-    for i, word in enumerate(top_words, start=1):
-        # Update with actual counts where available
-        word_counts = data[word].reindex(continuous_years.index, fill_value=0)
-
-        # Convert years and counts to lists for burst detection
-        r = continuous_years.index.tolist()  # List of all years
-        r = np.array(r, dtype=int)
-        d = word_counts.values.tolist()  # non-zero counts
-        d = np.array(d, dtype=float)
-        y = r.copy()
-
-        if len(r) > 0 and len(d) > 0:
-            n = len(r)
-            q, d, r, p = burst_detection(d, r, n, s=2.0, gamma=1.0, smooth_win=1)
-            bursts = enumerate_bursts(q, word)
-            bursts = burst_weights(bursts, r, d, p)
-            all_bursts_list.append(bursts)
-
-        freq_data = yearly_term_frequency[word].reindex(years, fill_value=0)
-        all_freq_data[word] = freq_data
-
-    all_bursts = pd.concat(all_bursts_list, ignore_index=True)
-
-    num_unique_labels = len(all_bursts['label'].unique())
-
-    num_rows = math.ceil(top_n / 2)
-
-    if running_total == "Running total":
-        all_freq_data = all_freq_data.cumsum()
-
-    return all_bursts, all_freq_data, num_unique_labels, num_rows
-
-@st.cache_data(ttl=3600)
-def convert_df(df):
-    return df.to_csv().encode("utf-8")
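
The removed apply_burst_detection() wraps the burst_detection package (an implementation of Kleinberg's two-state burst model). A hedged sketch of that package's documented pipeline on toy series; note that the package README treats the first argument as target-event counts per period and the second as total events per period, whereas the code above passes word counts and the year axis:

    # Toy run of the burst_detection pipeline (same s/gamma as above).
    import numpy as np
    import burst_detection as bd

    r = np.array([0, 2, 1, 8, 9, 1], dtype=float)        # yearly counts of one word
    d = np.array([10, 10, 10, 10, 10, 10], dtype=float)  # yearly totals of all words
    n = len(r)

    q, d, r, p = bd.burst_detection(r, d, n, s=2.0, gamma=1.0, smooth_win=1)
    bursts = bd.enumerate_bursts(q, 'example word')   # -> label, begin, end
    bursts = bd.burst_weights(bursts, r, d, p)        # adds a 'weight' column
    print(bursts)  # begin/end are positional indices into the year axis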
-
-@st.cache_data(ttl=3600)
-def scattervis(bursts, freq_data):
-    freq_data.reset_index(inplace=True)
-    freq_data.rename(columns={"index": "Year"}, inplace=True)
-
-    freq_data_melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
-    freq_data_melted = freq_data_melted[freq_data_melted["Value"] > 0]
-    wordlist = freq_data_melted["Category"].unique()
-
-    years = freq_data["Year"].tolist()
-    bursts["begin"] = bursts["begin"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
-    bursts["end"] = bursts["end"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
-    burst_points = []
-
-    for _, row in bursts.iterrows():
-        for year in range(row["begin"], row["end"] + 1):
-            burst_points.append((year, row["label"], row["weight"]))
-
-    burst_points_df = pd.DataFrame(burst_points, columns=["Year", "Category", "Weight"])
-
-    fig = go.Figure()
-
-    # scatter trace for burst points
-    fig.add_trace(go.Scatter(
-        x=burst_points_df["Year"],
-        y=burst_points_df["Category"],
-        mode='markers',
-        marker=dict(
-            symbol='square',
-            size=40,
-            color='red',
-            opacity=0.5),
-        hoverinfo='text',
-        text=burst_points_df["Weight"],
-        showlegend=False
-    ))
-
-    # scatter trace for freq_data
-    fig.add_trace(go.Scatter(
-        x=freq_data_melted["Year"],
-        y=freq_data_melted["Category"],
-        mode='markers+text',
-        marker=dict(
-            symbol='square',
-            size=30,
-            color=freq_data_melted["Value"],
-            colorscale='Blues',
-            showscale=False),
-        text=freq_data_melted["Value"],
-        textposition="middle center",
-        textfont=dict(
-            size=16,
-            color=['white' if value > freq_data_melted["Value"].max()/2 else 'black' for value in freq_data_melted["Value"]])
-    ))
-
-    min_year = min(years)
-    max_year = max(years)
-
-    fig.update_layout(
-        xaxis=dict(tickmode='linear', dtick=1, range=[(min_year-1), (max_year+1)], tickfont = dict(size=16), automargin=True, showgrid=False, zeroline=False),
-        yaxis=dict(tickvals=wordlist, ticktext=wordlist, tickmode='array', tickfont = dict(size=16), automargin=True, showgrid=False, zeroline=False),
-        plot_bgcolor='white',
-        paper_bgcolor='white',
-        showlegend=False,
-        margin=dict(l=1, r=1, t=1, b=1),
-        height=top_n*50+2,
-        width=(max_year-min_year)*52+100,
-        autosize=False
-    )
-
-    fig.write_image("scatter_plot.png")
-    st.image("scatter_plot.png")
-    pio.write_image(fig, 'result.png', scale=4)
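
scattervis() above reshapes the wide year-by-word table into long form with DataFrame.melt before plotting; the begin/end columns it rewrites are positional indices mapped back to calendar years. The melt step in isolation, on a tiny frame:

    # How the wide yearly-frequency table becomes long form for the scatter.
    import pandas as pd

    freq_data = pd.DataFrame({'Year': [2020, 2021], 'topic': [2, 0], 'burst': [0, 1]})
    melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
    melted = melted[melted["Value"] > 0]  # keep only non-zero cells, as above
    print(melted)
    #    Year Category  Value
    # 0  2020    topic      2
    # 3  2021    burst      1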
-
-@st.cache_data(ttl=3600)
-def linegraph(bursts, freq_data):
-    fig = make_subplots(rows=num_rows, cols=2, subplot_titles=freq_data.columns[:top_n])
-
-    row, col = 1, 1
-    for i, column in enumerate(freq_data.columns[:top_n]):
-        fig.add_trace(go.Scatter(
-            x=freq_data.index, y=freq_data[column], mode='lines+markers+text', name=column,
-            line_shape='linear',
-            hoverinfo='text',
-            hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
-            text=freq_data[column],
-            textposition='top center'
-        ), row=row, col=col)
-
-        # Add area charts
-        for _, row_data in bursts[bursts['label'] == column].iterrows():
-            x_values = freq_data.index[row_data['begin']:row_data['end']+1]
-            y_values = freq_data[column][row_data['begin']:row_data['end']+1]
-
-            #middle_y = sum(y_values) / len(y_values)
-            y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
-            x_offset = 0.1
-
-            # Add area chart
-            fig.add_trace(go.Scatter(
-                x=x_values,
-                y=y_values,
-                fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
-            ), row=row, col=col)
-
-            align_value = "left" if running_total == "Running total" else "center"
-            valign_value = "bottom" if running_total == "Running total" else "middle"
-
-            # Add annotation for weight at the bottom
-            fig.add_annotation(
-                x=x_values[0] + x_offset,
-                y=y_post,
-                text=f"Weight: {row_data['weight']:.2f}",
-                showarrow=False,
-                font=dict(
-                    color="black",
-                    size=12),
-                align=align_value,
-                valign=valign_value,
-                textangle=270,
-                row=row, col=col
-            )
-
-        col += 1
-        if col > 2:
-            col = 1
-            row += 1
-
-    fig.update_layout(
-        showlegend=False,
-        margin=dict(l=20, r=20, t=100, b=20),
-        height=num_rows * 500,
-        width=1500
-    )
-
-    fig.write_image("line_graph.png")
-    st.image("line_graph.png")
-    pio.write_image(fig, 'result.png', scale=4)
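
linegraph() lays its per-word charts on a two-column Plotly grid, advancing (row, col) by hand. The placement pattern on its own, with dummy traces:

    # Manual (row, col) cycling over a two-column subplot grid.
    import math
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    words = ['alpha', 'beta', 'gamma']
    num_rows = math.ceil(len(words) / 2)
    fig = make_subplots(rows=num_rows, cols=2, subplot_titles=words)

    row, col = 1, 1
    for word in words:
        fig.add_trace(go.Scatter(x=[2020, 2021], y=[1, 2], name=word), row=row, col=col)
        col += 1
        if col > 2:        # wrap to the next row after filling two columns
            col, row = 1, row + 1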
-
-@st.cache_data(ttl=3600)
-def download_result(freq_data, bursts):
-    csv1 = convert_df(freq_data)
-    csv2 = convert_df(bursts)
-    return csv1, csv2
-
 uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)

 if uploaded_file is not None:
     try:
-
-
-
-
-
-
-
-        d1, d2 = st.columns([3,7])
-        df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
-        col_name = d1.selectbox("Select column to analyze",
-                                (coldf), on_change=reset_all)
-        excluded_words_input = d2.text_input("Words to exclude (comma-separated)", on_change=reset_all)
-
-        if (GAP != 0):
-            YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
-        else:
-            e1.write('You only have data in ', (MAX))
-            sys.exit(1)
-
-        yearly_term_frequency, top_words = clean_data(df)

-
-
-
-
-
-
-
-
            else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

-
-
-
-
-
-
        with tab2:
-            st.markdown('**
-
-
-            st.markdown('**Li, M., Zheng, Z., & Yi, Q. (2024). The landscape of hot topics and research frontiers in Kawasaki disease: scientometric analysis. Heliyon, 10(8), e29680–e29680.** https://doi.org/10.1016/j.heliyon.2024.e29680')
-            st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
-            st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
-
     except:
         st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
         st.stop()
New version (added lines are marked "+"):

@@ -1,19 +1,8 @@
+#===import module===
 import streamlit as st
 import pandas as pd
+import plotly.express as px
 import numpy as np
 import sys

 #===config===
@@ -42,384 +31,114 @@ with st.popover("🔗 Menu"):
     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+
+st.header("Sunburst Visualization", anchor=False)
 st.subheader('Put your file here...', anchor=False)

 #===clear cache===
 def reset_all():
     st.cache_data.clear()

+#===check type===
 @st.cache_data(ttl=3600)
+def get_ext(extype):
     extype = uploaded_file.name
     return extype

 @st.cache_data(ttl=3600)
+def upload(extype):
+    papers = pd.read_csv(uploaded_file)
+    #lens.org
+    if 'Publication Year' in papers.columns:
+        papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
+                               'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
+    return papers

 @st.cache_data(ttl=3600)
 def conv_txt(extype):
     col_dict = {'TI': 'Title',
                 'SO': 'Source title',
                 'DT': 'Document Type',
+                'DE': 'Author Keywords',
+                'ID': 'Keywords Plus',
                 'AB': 'Abstract',
+                'TC': 'Cited by',
+                'PY': 'Year',}
+    papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
+    papers.rename(columns=col_dict, inplace=True)
+    return papers
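
The reworked conv_txt() reads tab-delimited exports (e.g., Web of Science style) and maps two-letter field tags to readable column names; the lineterminator='\r' argument handles files saved with bare carriage returns. A self-contained sketch using an in-memory file instead of the uploader:

    # Renaming tag columns from a tab-delimited export (synthetic one-row file).
    import io
    import pandas as pd

    raw = "TI\tSO\tPY\tTC\nSome paper\tSome journal\t2021\t5\n"
    col_dict = {'TI': 'Title', 'SO': 'Source title', 'PY': 'Year', 'TC': 'Cited by'}

    papers = pd.read_csv(io.StringIO(raw), sep='\t')  # a real export may need lineterminator='\r'
    papers.rename(columns=col_dict, inplace=True)
    print(papers.columns.tolist())  # ['Title', 'Source title', 'Year', 'Cited by']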

+#===Read data===
 uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)

 if uploaded_file is not None:
     try:
+        extype = get_ext(uploaded_file)
+        if extype.endswith('.csv'):
+            papers = upload(extype)
+
+        elif extype.endswith('.txt'):
+            papers = conv_txt(extype)

+        @st.cache_data(ttl=3600)
+        def get_minmax(extype):
+            extype = extype
+            MIN = int(papers['Year'].min())
+            MAX = int(papers['Year'].max())
+            GAP = MAX - MIN
+            return papers, MIN, MAX, GAP
+
+        tab1, tab2 = st.tabs(["📈 Generate visualization", "📓 Recommended Reading"])
+
+        with tab1:
+            #===sunburst===
+            try:
+                papers, MIN, MAX, GAP = get_minmax(extype)
+            except KeyError:
+                st.error('Error: Please check again your columns.')
+                sys.exit(1)
+
+            if (GAP != 0):
+                YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
             else:
+                st.write('You only have data in ', (MAX))
+                YEAR = (MIN, MAX)
+
+            @st.cache_data(ttl=3600)
+            def listyear(extype):
+                global papers
+                years = list(range(YEAR[0],YEAR[1]+1))
+                papers = papers.loc[papers['Year'].isin(years)]
+                return years, papers
+
+            @st.cache_data(ttl=3600)
+            def vis_sunbrust(extype):
+                papers['Cited by'] = papers['Cited by'].fillna(0)
+                vis = pd.DataFrame()
+                vis[['doctype','source','citby','year']] = papers[['Document Type','Source title','Cited by','Year']]
+                viz=vis.groupby(['doctype', 'source', 'year'])['citby'].agg(['sum','count']).reset_index()
+                viz.rename(columns={'sum': 'cited by', 'count': 'total docs'}, inplace=True)
+
+                fig = px.sunburst(viz, path=['doctype', 'source', 'year'], values='total docs',
+                                  color='cited by',
+                                  color_continuous_scale='RdBu',
+                                  color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
+                fig.update_layout(height=800, width=1200)
+                return fig
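
vis_sunbrust() aggregates citations and document counts per (document type, source, year), sizes the sunburst rings by document count, and centers the diverging color scale on the citation average weighted by document count. The same recipe end to end, on made-up rows:

    # Sunburst sized by doc counts and colored by citations (illustrative data).
    import numpy as np
    import pandas as pd
    import plotly.express as px

    rows = pd.DataFrame({'doctype': ['journal', 'journal', 'conference'],
                         'source':  ['J. One', 'J. One', 'Conf. Two'],
                         'year':    [2020, 2021, 2021],
                         'citby':   [3, 5, 1]})
    viz = rows.groupby(['doctype', 'source', 'year'])['citby'].agg(['sum', 'count']).reset_index()
    viz.rename(columns={'sum': 'cited by', 'count': 'total docs'}, inplace=True)

    fig = px.sunburst(viz, path=['doctype', 'source', 'year'], values='total docs',
                      color='cited by', color_continuous_scale='RdBu',
                      color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
    fig.show()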
+
+            years, papers = listyear(extype)

+            if {'Document Type','Source title','Cited by','Year'}.issubset(papers.columns):
+                fig = vis_sunbrust(extype)
+                st.plotly_chart(fig, height=800, width=1200) #use_container_width=True)
+
+            else:
+                st.error('We require these columns: Document Type, Source title, Cited by, Year', icon="🚨")
+
         with tab2:
+            st.markdown('**numpy.average — NumPy v1.24 Manual. (n.d.). Numpy.Average — NumPy v1.24 Manual.** https://numpy.org/doc/stable/reference/generated/numpy.average.html')
+            st.markdown('**Sunburst. (n.d.). Sunburst Charts in Python.** https://plotly.com/python/sunburst-charts/')
+
     except:
         st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
         st.stop()