File size: 5,702 Bytes
5cc9457
04b8152
 
 
5cc9457
04b8152
 
5cc9457
04b8152
 
 
7090878
5cc9457
 
c1ef6fc
d8b957c
 
 
 
 
 
 
 
 
 
 
ab928f5
 
 
 
765bd01
ab928f5
765bd01
ab928f5
 
 
765bd01
 
ab928f5
 
 
 
 
 
1c1d715
ab928f5
 
 
 
 
 
 
 
 
 
 
 
9f9c2f1
ab928f5
c1ef6fc
 
 
ab928f5
765bd01
 
 
ab928f5
765bd01
 
ab928f5
 
 
 
 
 
 
 
765bd01
ab928f5
 
765bd01
ab928f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f9c2f1
ab928f5
 
 
 
 
 
 
 
 
d8b957c
db25a2e
 
 
 
ab928f5
 
 
 
765bd01
 
ab928f5
 
 
3096aed
ab928f5
 
 
 
 
 
 
3096aed
ab928f5
3096aed
 
 
 
69dca09
7074a60
 
04b8152
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import scattertext as sct
import spacy
import pandas as pd
import en_core_web_md
import streamlit as st


# Load the spaCy English pipeline used to parse documents for Scattertext.
# A single load is enough: en_core_web_md.load() and
# spacy.load("en_core_web_md") both return the same installed pipeline, so
# loading it twice only doubles the start-up cost.
nlp = spacy.load("en_core_web_md")

# Page heading displayed above the file uploader.
st.title("Scattertext Analysis")
st.header("Put your file here... ")

def find_column_name(df, column_hint):
    """
    Return the label of the column in ``df`` that matches ``column_hint``.

    The comparison ignores letter case and every kind of whitespace (spaces,
    tabs, leading/trailing padding), so the hint 'Source Title' matches a
    column named 'source title' or ' SourceTitle '. The match must be exact
    after normalization: the hint 'Title' does not match 'Source Title'.

    Args:
        df: Object exposing a ``columns`` iterable of column labels
            (e.g. a pandas DataFrame).
        column_hint: Human-readable column name to look for.

    Returns:
        The first matching label, exactly as it appears in ``df.columns``.

    Raises:
        ValueError: If no column matches the hint.
    """
    def _normalize(label):
        # str() tolerates non-string labels (e.g. integer column names);
        # split() + join removes all whitespace, not only plain spaces.
        return "".join(str(label).split()).lower()

    wanted = _normalize(column_hint)
    for col in df.columns:
        if _normalize(col) == wanted:
            return col
    # Report the hint as the caller wrote it, not its normalized form.
    raise ValueError(f"Column matching '{column_hint}' not found.")

def compatison1(selected_column):
    """
    Render the "Sources" comparison: contrast the texts of two sources.

    Shows two select boxes listing the unique values of the source-title
    column. When the button is pressed, a Scattertext corpus is built from
    the rows belonging to the two chosen sources and the interactive plot is
    embedded in the page together with a download button for its HTML.

    Relies on the module-level names ``df``, ``source_title_col`` and ``nlp``
    being defined before the call.

    Args:
        selected_column: Name of the ``df`` column holding the text to analyze.

    Returns:
        None. All output is rendered through Streamlit.
    """
    source_options = df[source_title_col].unique()
    row2_col1, row2_col2 = st.columns(2)
    with row2_col1:
        first_source = st.selectbox("Choose First Source", source_options, key='first_source_select')
    with row2_col2:
        second_source = st.selectbox("Choose Second Source", source_options,
                                     key='second_source_select')

    if not st.button("Generate the Scattertext Plot"):
        return

    # Both select boxes start on the same option. Comparing a source with
    # itself would duplicate its rows and leave the corpus with a single
    # category, so there would be nothing to contrast it with.
    if first_source == second_source:
        st.warning("Please choose two different sources to compare.")
        return

    # Keep the rows of each chosen source, dropping rows without text because
    # the spaCy pipeline only accepts strings.
    first_data = df[df[source_title_col] == first_source].dropna(subset=[selected_column])
    second_data = df[df[source_title_col] == second_source].dropna(subset=[selected_column])
    if first_data.empty or second_data.empty:
        st.warning("One of the selected sources has no text in the chosen column.")
        return
    filtered_data = pd.concat([first_data, second_data])

    # Build the corpus, using the source title as the category.
    corpus = sct.CorpusFromPandas(
        filtered_data,
        category_col=source_title_col,
        text_col=selected_column,
        nlp=nlp,
    ).build()

    # Generate the HTML visualization.
    html = sct.produce_scattertext_explorer(corpus,
                                            category=first_source,
                                            category_name=first_source,
                                            not_category_name=second_source,
                                            width_in_pixels=900,
                                            minimum_term_frequency=0,
                                            metadata=filtered_data)
    st.components.v1.html(html, width=1000, height=600)
    # Give the download a real file name and MIME type so it opens in a browser.
    st.download_button('Download the plot', html,
                       file_name='scattertext_sources.html', mime='text/html')


# type_of_comparison 2: compare the texts of two year ranges
def comparison2(selected_column):
    """
    Render the "Years" comparison: contrast the texts of two year ranges.

    Converts the year column of ``df`` to integers (dropping rows whose year
    is not numeric), shows two range sliders spanning the available years and,
    when the button is pressed, builds a Scattertext corpus in which each
    document is labelled 'First range' or 'Second range'. A document whose
    year falls in both ranges appears once under each label.

    Relies on the module-level names ``df``, ``year_col`` and ``nlp`` being
    defined before the call, and modifies ``df`` in place.

    Args:
        selected_column: Name of the ``df`` column holding the text to analyze.

    Returns:
        None. All output is rendered through Streamlit.
    """
    # Coerce the year column to integers in place; rows whose year cannot be
    # parsed as a number are removed from ``df``.
    df[year_col] = pd.to_numeric(df[year_col], errors='coerce')
    df.dropna(subset=[year_col], inplace=True)
    df[year_col] = df[year_col].astype(int)

    # With no valid year left, min()/max() would be NaN and int() would raise.
    if df.empty:
        st.warning("No rows with a valid year were found in the uploaded file.")
        return

    min_year = int(df[year_col].min())
    max_year = int(df[year_col].max())

    # Layout: one range slider per column.
    row2_col1, row2_col2 = st.columns(2)
    with row2_col1:
        first_range = st.slider("First range", min_value=min_year, max_value=max_year, step=1,
                                value=(min_year, max_year))
    with row2_col2:
        second_range = st.slider("Second range", min_value=min_year, max_value=max_year, step=1,
                                 value=(min_year, max_year))

    if not st.button("Generate the Scattertext Plot"):
        return

    def _rows_in_range(year_range, label):
        # Rows whose year lies in the inclusive range and that have text;
        # rows without text are dropped because spaCy only accepts strings.
        low, high = year_range
        in_range = (df[year_col] >= low) & (df[year_col] <= high)
        rows = df[in_range].dropna(subset=[selected_column]).copy()
        rows['Topic Range'] = label
        return rows

    first_range_filter_df = _rows_in_range(first_range, 'First range')
    second_range_filter_df = _rows_in_range(second_range, 'Second range')

    # A range without documents would leave the corpus with one category only.
    if first_range_filter_df.empty or second_range_filter_df.empty:
        st.warning("One of the selected year ranges has no text in the chosen column.")
        return

    filtered_df = pd.concat([first_range_filter_df, second_range_filter_df])

    # Build the corpus, using the range label as the category.
    corpus = sct.CorpusFromPandas(
        filtered_df,
        category_col="Topic Range",
        text_col=selected_column,
        nlp=nlp,
    ).build()

    # Generate the HTML visualization.
    html = sct.produce_scattertext_explorer(corpus,
                                            category='First range',
                                            category_name='First range',
                                            not_category_name='Second range',
                                            width_in_pixels=900,
                                            minimum_term_frequency=0,
                                            metadata=filtered_df)
    st.components.v1.html(html, width=1000, height=600)
    # Give the download a real file name and MIME type so it opens in a browser.
    st.download_button('Download the plot', html,
                       file_name='scattertext_years.html', mime='text/html')


if __name__ == '__main__':
    # Entry point of the app. The names assigned here (df, source_title_col,
    # abstract_col, title_col, year_col) are module-level globals that
    # compatison1() and comparison2() read.
    uploaded_file = st.file_uploader("Choose a file", type=["csv", "txt"])
    if uploaded_file is not None:
        # Compare the extension case-insensitively so "EXPORT.CSV" is accepted.
        file_name = uploaded_file.name.lower()

        if file_name.endswith(".csv"):
            # CSV export: locate the columns by their human-readable headers.
            df = pd.read_csv(uploaded_file)
            try:
                source_title_col = find_column_name(df, 'Source Title')
                abstract_col = find_column_name(df, 'Abstract')
                title_col = find_column_name(df, 'Title')
                year_col = find_column_name(df, 'Year')
            except ValueError as err:
                # Show a readable message instead of a traceback.
                st.error(str(err))
                st.stop()
        elif file_name.endswith(".txt"):
            # Text export: contents are assumed to be separated by tabs, with
            # two-letter field tags as column headers (presumably a Web of
            # Science export -- confirm).
            df = pd.read_table(uploaded_file, sep='\t')
            abstract_col = 'AB'
            title_col = 'TI'
            source_title_col = 'SO'
            year_col = 'PY'
            missing = [col for col in (abstract_col, title_col, source_title_col, year_col)
                       if col not in df.columns]
            if missing:
                # Fail early with a clear message rather than a later KeyError.
                st.error(f"Missing expected column(s): {', '.join(missing)}")
                st.stop()
        else:
            st.error("Unsupported file format.")
            st.stop()

        column_choices = (abstract_col, title_col)

        row1_col1, row1_col2 = st.columns(2)
        with row1_col1:
            choice = st.selectbox("Choose column to analyze", column_choices)
        with row1_col2:
            comparison_options = ('Sources', 'Years')
            type_of_comparison = st.selectbox("Type of comparison", comparison_options)

        # The two comparison types are mutually exclusive.
        if type_of_comparison == 'Sources':
            compatison1(choice)
        elif type_of_comparison == 'Years':
            comparison2(choice)