Update pages/4 Sunburst.py

pages/4 Sunburst.py  CHANGED  (+83, -364)
Old version (removed lines are marked "-"; unmarked lines are unchanged context):

@@ -1,19 +1,8 @@
 import streamlit as st
 import pandas as pd
-
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-import nltk
-import spacy
-from burst_detection import burst_detection, enumerate_bursts, burst_weights
-import matplotlib.pyplot as plt
-import os
-import io
-import math
 import numpy as np
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-import plotly.io as pio
 import sys

 #===config===
@@ -42,384 +31,114 @@ with st.popover("🔗 Menu"):
     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
-
-st.header("
 st.subheader('Put your file here...', anchor=False)

 #===clear cache===
 def reset_all():
     st.cache_data.clear()

-
-nlp = spacy.load("en_core_web_md")
-
 @st.cache_data(ttl=3600)
-def upload(extype):
-    df = pd.read_csv(uploaded_file)
-    #lens.org
-    if 'Publication Year' in df.columns:
-        df.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
-                           'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
-    return df
-
-@st.cache_data(ttl=3600)
-def get_ext(uploaded_file):
     extype = uploaded_file.name
     return extype

 @st.cache_data(ttl=3600)
-def get_minmax(df):
-    MIN = int(df['Year'].min())
-    MAX = int(df['Year'].max())
-    GAP = MAX - MIN
-    return MIN, MAX, GAP

 @st.cache_data(ttl=3600)
 def conv_txt(extype):
     col_dict = {'TI': 'Title',
                 'SO': 'Source title',
                 'DT': 'Document Type',
                 'AB': 'Abstract',
-                '
-
-
-
-
-# Helper Functions
-@st.cache_data(ttl=3600)
-def get_column_name(df, possible_names):
-    """Find and return existing column names from a list of possible names."""
-    for name in possible_names:
-        if name in df.columns:
-            return name
-    raise ValueError(f"None of the possible names {possible_names} found in DataFrame columns.")
-
-@st.cache_data(ttl=3600)
-def preprocess_text(text):
-    """Lemmatize and remove stopwords from text."""
-    return ' '.join([token.lemma_.lower() for token in nlp(text) if token.is_alpha and not token.is_stop])
-
-@st.cache_data(ttl=3600)
-def load_data(uploaded_file):
-    """Load data from the uploaded file."""
-    extype = get_ext(uploaded_file)
-    if extype.endswith('.csv'):
-        df = upload(extype)
-    elif extype.endswith('.txt'):
-        df = conv_txt(extype)
-
-    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
-    df = df.dropna(subset=['Year'])
-    df['Year'] = df['Year'].astype(int)
-
-    if 'Title' in df.columns and 'Abstract' in df.columns:
-        coldf = ['Abstract', 'Title']
-    elif 'Title' in df.columns:
-        coldf = ['Title']
-    elif 'Abstract' in df.columns:
-        coldf = ['Abstract']
-    else:
-        coldf = sorted(df.select_dtypes(include=['object']).columns.tolist())
-
-    MIN, MAX, GAP = get_minmax(df)

-
-
-@st.cache_data(ttl=3600)
-def clean_data(df):
-
-    years = list(range(YEAR[0],YEAR[1]+1))
-    df = df.loc[df['Year'].isin(years)]
-
-    # Preprocess text
-    df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
-
-    # Vectorize processed text
-    vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
-    X = vectorizer.fit_transform(df['processed'].tolist())
-
-    # Create DataFrame from the Document-Term Matrix (DTM)
-    dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=df['Year'].values)
-    yearly_term_frequency = dtm.groupby(dtm.index).sum()
-
-    # User inputs for top words analysis and exclusions
-    excluded_words = [word.strip() for word in excluded_words_input.split(',')]
-
-    # Identify top words, excluding specified words
-    filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
-    top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
-
-    return yearly_term_frequency, top_words
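
Note on the removed clean_data(): it builds a document-term matrix with scikit-learn's CountVectorizer (not imported in the hunks shown, so presumably in the unchanged config section) and collapses it into per-year term counts. A minimal standalone sketch of that counting step, with made-up documents:

    # Sketch of clean_data()'s counting step on toy data (assumes scikit-learn).
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer

    docs = pd.DataFrame({'Year': [2020, 2020, 2021],
                         'processed': ['topic model', 'topic detection', 'burst detection']})

    # Text is already lemmatized, so tokenize by whitespace only.
    vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
    X = vectorizer.fit_transform(docs['processed'].tolist())

    # Document-term matrix indexed by year, summed into yearly frequencies.
    dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(),
                       index=docs['Year'].values)
    yearly_term_frequency = dtm.groupby(dtm.index).sum()
    print(yearly_term_frequency)
    #       burst  detection  model  topic
    # 2020      0          1      1      2
    # 2021      1          1      0      0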
-
-@st.cache_data(ttl=3600)
-def apply_burst_detection(top_words, data):
-    all_bursts_list = []
-
-    start_year = int(data.index.min())
-    end_year = int(data.index.max())
-    all_years = range(start_year, end_year + 1)
-
-    continuous_years = pd.Series(index=all_years, data=0)  # Start with a series of zeros for all years
-
-    years = continuous_years.index.tolist()
-
-    all_freq_data = pd.DataFrame(index=years)
-
-    for i, word in enumerate(top_words, start=1):
-        # Update with actual counts where available
-        word_counts = data[word].reindex(continuous_years.index, fill_value=0)
-
-        # Convert years and counts to lists for burst detection
-        r = continuous_years.index.tolist()  # List of all years
-        r = np.array(r, dtype=int)
-        d = word_counts.values.tolist()  # non-zero counts
-        d = np.array(d, dtype=float)
-        y = r.copy()
-
-        if len(r) > 0 and len(d) > 0:
-            n = len(r)
-            q, d, r, p = burst_detection(d, r, n, s=2.0, gamma=1.0, smooth_win=1)
-            bursts = enumerate_bursts(q, word)
-            bursts = burst_weights(bursts, r, d, p)
-            all_bursts_list.append(bursts)
-
-        freq_data = yearly_term_frequency[word].reindex(years, fill_value=0)
-        all_freq_data[word] = freq_data
-
-    all_bursts = pd.concat(all_bursts_list, ignore_index=True)
-
-    num_unique_labels = len(all_bursts['label'].unique())
-
-    num_rows = math.ceil(top_n / 2)
-
-    if running_total == "Running total":
-        all_freq_data = all_freq_data.cumsum()
-
-    return all_bursts, all_freq_data, num_unique_labels, num_rows
-
-@st.cache_data(ttl=3600)
-def convert_df(df):
-    return df.to_csv().encode("utf-8")
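
The removed apply_burst_detection() wraps the burst_detection package (an implementation of Kleinberg's two-state burst model). A hedged sketch of that package's documented pipeline on toy series; note that the package README treats the first argument as target-event counts per period and the second as total events per period, whereas the code above passes word counts and the year axis:

    # Toy run of the burst_detection pipeline (same s/gamma as above).
    import numpy as np
    import burst_detection as bd

    r = np.array([0, 2, 1, 8, 9, 1], dtype=float)        # yearly counts of one word
    d = np.array([10, 10, 10, 10, 10, 10], dtype=float)  # yearly totals of all words
    n = len(r)

    q, d, r, p = bd.burst_detection(r, d, n, s=2.0, gamma=1.0, smooth_win=1)
    bursts = bd.enumerate_bursts(q, 'example word')   # -> label, begin, end
    bursts = bd.burst_weights(bursts, r, d, p)        # adds a 'weight' column
    print(bursts)  # begin/end are positional indices into the year axis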
-
-@st.cache_data(ttl=3600)
-def scattervis(bursts, freq_data):
-    freq_data.reset_index(inplace=True)
-    freq_data.rename(columns={"index": "Year"}, inplace=True)
-
-    freq_data_melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
-    freq_data_melted = freq_data_melted[freq_data_melted["Value"] > 0]
-    wordlist = freq_data_melted["Category"].unique()
-
-    years = freq_data["Year"].tolist()
-    bursts["begin"] = bursts["begin"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
-    bursts["end"] = bursts["end"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
-    burst_points = []
-
-    for _, row in bursts.iterrows():
-        for year in range(row["begin"], row["end"] + 1):
-            burst_points.append((year, row["label"], row["weight"]))
-
-    burst_points_df = pd.DataFrame(burst_points, columns=["Year", "Category", "Weight"])
-
-    fig = go.Figure()
-
-    # scatter trace for burst points
-    fig.add_trace(go.Scatter(
-        x=burst_points_df["Year"],
-        y=burst_points_df["Category"],
-        mode='markers',
-        marker=dict(
-            symbol='square',
-            size=40,
-            color='red',
-            opacity=0.5),
-        hoverinfo='text',
-        text=burst_points_df["Weight"],
-        showlegend=False
-    ))
-
-    # scatter trace for freq_data
-    fig.add_trace(go.Scatter(
-        x=freq_data_melted["Year"],
-        y=freq_data_melted["Category"],
-        mode='markers+text',
-        marker=dict(
-            symbol='square',
-            size=30,
-            color=freq_data_melted["Value"],
-            colorscale='Blues',
-            showscale=False),
-        text=freq_data_melted["Value"],
-        textposition="middle center",
-        textfont=dict(
-            size=16,
-            color=['white' if value > freq_data_melted["Value"].max()/2 else 'black' for value in freq_data_melted["Value"]])
-    ))
-
-    min_year = min(years)
-    max_year = max(years)
-
-    fig.update_layout(
-        xaxis=dict(tickmode='linear', dtick=1, range=[(min_year-1), (max_year+1)], tickfont = dict(size=16), automargin=True, showgrid=False, zeroline=False),
-        yaxis=dict(tickvals=wordlist, ticktext=wordlist, tickmode='array', tickfont = dict(size=16), automargin=True, showgrid=False, zeroline=False),
-        plot_bgcolor='white',
-        paper_bgcolor='white',
-        showlegend=False,
-        margin=dict(l=1, r=1, t=1, b=1),
-        height=top_n*50+2,
-        width=(max_year-min_year)*52+100,
-        autosize=False
-    )
-
-    fig.write_image("scatter_plot.png")
-    st.image("scatter_plot.png")
-    pio.write_image(fig, 'result.png', scale=4)
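
scattervis() above reshapes the wide year-by-word table into long form with DataFrame.melt before plotting; the begin/end columns it rewrites are positional indices mapped back to calendar years. The melt step in isolation, on a tiny frame:

    # How the wide yearly-frequency table becomes long form for the scatter.
    import pandas as pd

    freq_data = pd.DataFrame({'Year': [2020, 2021], 'topic': [2, 0], 'burst': [0, 1]})
    melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
    melted = melted[melted["Value"] > 0]  # keep only non-zero cells, as above
    print(melted)
    #    Year Category  Value
    # 0  2020    topic      2
    # 3  2021    burst      1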
-
-@st.cache_data(ttl=3600)
-def linegraph(bursts, freq_data):
-    fig = make_subplots(rows=num_rows, cols=2, subplot_titles=freq_data.columns[:top_n])
-
-    row, col = 1, 1
-    for i, column in enumerate(freq_data.columns[:top_n]):
-        fig.add_trace(go.Scatter(
-            x=freq_data.index, y=freq_data[column], mode='lines+markers+text', name=column,
-            line_shape='linear',
-            hoverinfo='text',
-            hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
-            text=freq_data[column],
-            textposition='top center'
-        ), row=row, col=col)
-
-        # Add area charts
-        for _, row_data in bursts[bursts['label'] == column].iterrows():
-            x_values = freq_data.index[row_data['begin']:row_data['end']+1]
-            y_values = freq_data[column][row_data['begin']:row_data['end']+1]
-
-            #middle_y = sum(y_values) / len(y_values)
-            y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
-            x_offset = 0.1
-
-            # Add area chart
-            fig.add_trace(go.Scatter(
-                x=x_values,
-                y=y_values,
-                fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
-            ), row=row, col=col)
-
-            align_value = "left" if running_total == "Running total" else "center"
-            valign_value = "bottom" if running_total == "Running total" else "middle"
-
-            # Add annotation for weight at the bottom
-            fig.add_annotation(
-                x=x_values[0] + x_offset,
-                y=y_post,
-                text=f"Weight: {row_data['weight']:.2f}",
-                showarrow=False,
-                font=dict(
-                    color="black",
-                    size=12),
-                align=align_value,
-                valign=valign_value,
-                textangle=270,
-                row=row, col=col
-            )
-
-        col += 1
-        if col > 2:
-            col = 1
-            row += 1
-
-    fig.update_layout(
-        showlegend=False,
-        margin=dict(l=20, r=20, t=100, b=20),
-        height=num_rows * 500,
-        width=1500
-    )
-
-    fig.write_image("line_graph.png")
-    st.image("line_graph.png")
-    pio.write_image(fig, 'result.png', scale=4)
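
linegraph() lays its per-word charts on a two-column Plotly grid, advancing (row, col) by hand. The placement pattern on its own, with dummy traces:

    # Manual (row, col) cycling over a two-column subplot grid.
    import math
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    words = ['alpha', 'beta', 'gamma']
    num_rows = math.ceil(len(words) / 2)
    fig = make_subplots(rows=num_rows, cols=2, subplot_titles=words)

    row, col = 1, 1
    for word in words:
        fig.add_trace(go.Scatter(x=[2020, 2021], y=[1, 2], name=word), row=row, col=col)
        col += 1
        if col > 2:        # wrap to the next row after filling two columns
            col, row = 1, row + 1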
-
-@st.cache_data(ttl=3600)
-def download_result(freq_data, bursts):
-    csv1 = convert_df(freq_data)
-    csv2 = convert_df(bursts)
-    return csv1, csv2
-
 uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)

 if uploaded_file is not None:
     try:
-
-
-
-
-
-
-
-        d1, d2 = st.columns([3,7])
-        df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
-        col_name = d1.selectbox("Select column to analyze",
-                                (coldf), on_change=reset_all)
-        excluded_words_input = d2.text_input("Words to exclude (comma-separated)", on_change=reset_all)
-
-        if (GAP != 0):
-            YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
-        else:
-            e1.write('You only have data in ', (MAX))
-            sys.exit(1)
-
-        yearly_term_frequency, top_words = clean_data(df)

-
-
-
-
-
-
-
-
            else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

-
-
-
-
-
-
        with tab2:
-            st.markdown('**
-
-
-            st.markdown('**Li, M., Zheng, Z., & Yi, Q. (2024). The landscape of hot topics and research frontiers in Kawasaki disease: scientometric analysis. Heliyon, 10(8), e29680–e29680.** https://doi.org/10.1016/j.heliyon.2024.e29680')
-            st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
-            st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
-
     except:
         st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
         st.stop()
New version (added lines are marked "+"):

@@ -1,19 +1,8 @@
+#===import module===
 import streamlit as st
 import pandas as pd
+import plotly.express as px
 import numpy as np
 import sys

 #===config===
@@ -42,384 +31,114 @@ with st.popover("🔗 Menu"):
     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+
+st.header("Sunburst Visualization", anchor=False)
 st.subheader('Put your file here...', anchor=False)

 #===clear cache===
 def reset_all():
     st.cache_data.clear()

+#===check type===
 @st.cache_data(ttl=3600)
+def get_ext(extype):
     extype = uploaded_file.name
     return extype

 @st.cache_data(ttl=3600)
+def upload(extype):
+    papers = pd.read_csv(uploaded_file)
+    #lens.org
+    if 'Publication Year' in papers.columns:
+        papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
+                               'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
+    return papers

 @st.cache_data(ttl=3600)
 def conv_txt(extype):
     col_dict = {'TI': 'Title',
                 'SO': 'Source title',
                 'DT': 'Document Type',
+                'DE': 'Author Keywords',
+                'ID': 'Keywords Plus',
                 'AB': 'Abstract',
+                'TC': 'Cited by',
+                'PY': 'Year',}
+    papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
+    papers.rename(columns=col_dict, inplace=True)
+    return papers
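
The reworked conv_txt() reads tab-delimited exports (e.g., Web of Science style) and maps two-letter field tags to readable column names; the lineterminator='\r' argument handles files saved with bare carriage returns. A self-contained sketch using an in-memory file instead of the uploader:

    # Renaming tag columns from a tab-delimited export (synthetic one-row file).
    import io
    import pandas as pd

    raw = "TI\tSO\tPY\tTC\nSome paper\tSome journal\t2021\t5\n"
    col_dict = {'TI': 'Title', 'SO': 'Source title', 'PY': 'Year', 'TC': 'Cited by'}

    papers = pd.read_csv(io.StringIO(raw), sep='\t')  # a real export may need lineterminator='\r'
    papers.rename(columns=col_dict, inplace=True)
    print(papers.columns.tolist())  # ['Title', 'Source title', 'Year', 'Cited by']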

+#===Read data===
 uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)

 if uploaded_file is not None:
     try:
+        extype = get_ext(uploaded_file)
+        if extype.endswith('.csv'):
+            papers = upload(extype)
+
+        elif extype.endswith('.txt'):
+            papers = conv_txt(extype)

+        @st.cache_data(ttl=3600)
+        def get_minmax(extype):
+            extype = extype
+            MIN = int(papers['Year'].min())
+            MAX = int(papers['Year'].max())
+            GAP = MAX - MIN
+            return papers, MIN, MAX, GAP
+
+        tab1, tab2 = st.tabs(["📈 Generate visualization", "📓 Recommended Reading"])
+
+        with tab1:
+            #===sunburst===
+            try:
+                papers, MIN, MAX, GAP = get_minmax(extype)
+            except KeyError:
+                st.error('Error: Please check again your columns.')
+                sys.exit(1)
+
+            if (GAP != 0):
+                YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
             else:
+                st.write('You only have data in ', (MAX))
+                YEAR = (MIN, MAX)
+
+            @st.cache_data(ttl=3600)
+            def listyear(extype):
+                global papers
+                years = list(range(YEAR[0],YEAR[1]+1))
+                papers = papers.loc[papers['Year'].isin(years)]
+                return years, papers
+
+            @st.cache_data(ttl=3600)
+            def vis_sunbrust(extype):
+                papers['Cited by'] = papers['Cited by'].fillna(0)
+                vis = pd.DataFrame()
+                vis[['doctype','source','citby','year']] = papers[['Document Type','Source title','Cited by','Year']]
+                viz=vis.groupby(['doctype', 'source', 'year'])['citby'].agg(['sum','count']).reset_index()
+                viz.rename(columns={'sum': 'cited by', 'count': 'total docs'}, inplace=True)
+
+                fig = px.sunburst(viz, path=['doctype', 'source', 'year'], values='total docs',
+                                  color='cited by',
+                                  color_continuous_scale='RdBu',
+                                  color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
+                fig.update_layout(height=800, width=1200)
+                return fig
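
vis_sunbrust() aggregates citations and document counts per (document type, source, year), sizes the sunburst rings by document count, and centers the diverging color scale on the citation average weighted by document count. The same recipe end to end, on made-up rows:

    # Sunburst sized by doc counts and colored by citations (illustrative data).
    import numpy as np
    import pandas as pd
    import plotly.express as px

    rows = pd.DataFrame({'doctype': ['journal', 'journal', 'conference'],
                         'source':  ['J. One', 'J. One', 'Conf. Two'],
                         'year':    [2020, 2021, 2021],
                         'citby':   [3, 5, 1]})
    viz = rows.groupby(['doctype', 'source', 'year'])['citby'].agg(['sum', 'count']).reset_index()
    viz.rename(columns={'sum': 'cited by', 'count': 'total docs'}, inplace=True)

    fig = px.sunburst(viz, path=['doctype', 'source', 'year'], values='total docs',
                      color='cited by', color_continuous_scale='RdBu',
                      color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
    fig.show()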
+
+            years, papers = listyear(extype)

+            if {'Document Type','Source title','Cited by','Year'}.issubset(papers.columns):
+                fig = vis_sunbrust(extype)
+                st.plotly_chart(fig, height=800, width=1200) #use_container_width=True)
+
+            else:
+                st.error('We require these columns: Document Type, Source title, Cited by, Year', icon="🚨")
+
         with tab2:
+            st.markdown('**numpy.average — NumPy v1.24 Manual. (n.d.). Numpy.Average — NumPy v1.24 Manual.** https://numpy.org/doc/stable/reference/generated/numpy.average.html')
+            st.markdown('**Sunburst. (n.d.). Sunburst Charts in Python.** https://plotly.com/python/sunburst-charts/')
+
     except:
         st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
         st.stop()