Spaces:
Runtime error
Runtime error
File size: 5,702 Bytes
5cc9457 04b8152 5cc9457 04b8152 5cc9457 04b8152 7090878 5cc9457 c1ef6fc d8b957c ab928f5 765bd01 ab928f5 765bd01 ab928f5 765bd01 ab928f5 1c1d715 ab928f5 9f9c2f1 ab928f5 c1ef6fc ab928f5 765bd01 ab928f5 765bd01 ab928f5 765bd01 ab928f5 765bd01 ab928f5 9f9c2f1 ab928f5 d8b957c db25a2e ab928f5 765bd01 ab928f5 3096aed ab928f5 3096aed ab928f5 3096aed 69dca09 7074a60 04b8152 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import scattertext as sct
import spacy
import pandas as pd
import en_core_web_md
import streamlit as st
# Load the spaCy language model once. `spacy.load("en_core_web_md")` and
# `en_core_web_md.load()` build the same pipeline, so calling both only
# doubles start-up time and memory use.
nlp = spacy.load("en_core_web_md")

# Scopus file loading: page title and prompt shown above the uploader.
st.title("Scattertext Analysis")
st.header("Put your file here... ")
def find_column_name(df, column_hint):
    """
    Return the first column label of ``df`` that matches ``column_hint``.

    The comparison ignores letter case and all whitespace (spaces, tabs,
    newlines), so ``'Source Title'`` matches ``'source  title'`` or
    ``'SourceTitle'``. Labels are returned exactly as they appear in
    ``df.columns``.

    Args:
        df: Object exposing an iterable ``columns`` attribute (a DataFrame).
        column_hint: Human-readable name of the wanted column.

    Returns:
        The original column label that matched.

    Raises:
        ValueError: If no column matches the hint.
    """
    def _normalize(label):
        # str() lets non-string labels (e.g. integer column names) take part
        # in the comparison instead of raising AttributeError on .lower().
        # split()/join removes every whitespace character, not just ' '.
        return "".join(str(label).split()).lower()

    wanted = _normalize(column_hint)
    for col in df.columns:
        if _normalize(col) == wanted:
            return col
    # Report the hint as the caller wrote it, not its normalised form.
    raise ValueError(f"Column matching '{column_hint}' not found.")
def compatison1(selected_column):
    """
    Render the "Sources" comparison (type_of_comparison 1).

    Shows two select boxes listing the distinct values of the module-level
    ``df[source_title_col]``, and, when the button is pressed, builds a
    Scattertext plot contrasting the text in ``selected_column`` between the
    two chosen sources. The plot is embedded in the page and offered as an
    HTML download.

    Args:
        selected_column: Name of the ``df`` column holding the text to analyse.

    Returns:
        None. All output goes to the Streamlit page.
    """
    # Missing source titles cannot be compared, so do not offer them.
    sources = df[source_title_col].dropna().unique()
    if len(sources) < 2:
        st.warning("At least two different sources are needed for this comparison.")
        return

    row2_col1, row2_col2 = st.columns(2)
    with row2_col1:
        first_source = st.selectbox("Choose First Source", sources, key='first_source_select')
    with row2_col2:
        # Start on the second entry; otherwise both boxes default to the same
        # source and the comparison is a source against itself.
        second_source = st.selectbox("Choose Second Source", sources, index=1,
                                     key='second_source_select')

    if first_source == second_source:
        st.warning("Please choose two different sources.")
        return

    # filter data; rows without text cannot be parsed by the language model
    usable = df.dropna(subset=[selected_column])
    first_data = usable[usable[source_title_col] == first_source]
    second_data = usable[usable[source_title_col] == second_source]
    if first_data.empty or second_data.empty:
        # The plot needs documents in both categories.
        st.warning("One of the selected sources has no text in the chosen column.")
        return
    filtered_data = pd.concat([first_data, second_data])

    if st.button("Generate the Scattertext Plot"):
        # make plot
        corpus = sct.CorpusFromPandas(
            filtered_data,
            category_col=source_title_col,
            text_col=selected_column,
            nlp=nlp,
        ).build()
        # generate HTML visualization
        html = sct.produce_scattertext_explorer(
            corpus,
            category=first_source,
            category_name=first_source,
            not_category_name=second_source,
            width_in_pixels=900,
            minimum_term_frequency=0,
            metadata=filtered_data,
        )
        st.components.v1.html(html, width=1000, height=600)
        # Give the download a proper name and type so it opens in a browser.
        st.download_button('Download the plot', html,
                           file_name='scattertext_plot.html', mime='text/html')
# type_of_comparison 2
def comparison2(selected_column):
    """
    Render the "Years" comparison (type_of_comparison 2).

    Converts the module-level ``df[year_col]`` to integers, shows two year
    range sliders, and, when the button is pressed, builds a Scattertext plot
    contrasting the text in ``selected_column`` between the rows falling in
    the first range and those falling in the second range. The plot is
    embedded in the page and offered as an HTML download.

    Args:
        selected_column: Name of the ``df`` column holding the text to analyse.

    Returns:
        None. All output goes to the Streamlit page.
    """
    # Clean the years on a copy so the uploaded DataFrame is left untouched.
    data = df.copy()
    data[year_col] = pd.to_numeric(data[year_col], errors='coerce')
    data = data.dropna(subset=[year_col])
    if data.empty:
        # min()/max() of an empty column is NaN and cannot be cast to int.
        st.warning("No rows with a valid year were found.")
        return
    data[year_col] = data[year_col].astype(int)
    min_year = int(data[year_col].min())
    max_year = int(data[year_col].max())
    if min_year == max_year:
        # With a single year both ranges would be identical.
        st.warning("At least two different years are needed for this comparison.")
        return

    # Default to two disjoint halves; two full-range sliders would put every
    # document in both categories.
    mid_year = (min_year + max_year) // 2

    # layout row2
    row2_col1, row2_col2 = st.columns(2)
    with row2_col1:
        first_range = st.slider("First range", min_value=min_year, max_value=max_year,
                                step=1, value=(min_year, mid_year))
    with row2_col2:
        second_range = st.slider("Second range", min_value=min_year, max_value=max_year,
                                 step=1, value=(mid_year + 1, max_year))

    # filter data; rows without text cannot be parsed by the language model
    usable = data.dropna(subset=[selected_column])
    years = usable[year_col]
    first_range_filter_df = usable[(years >= first_range[0]) & (years <= first_range[1])].copy()
    first_range_filter_df['Topic Range'] = 'First range'
    second_range_filter_df = usable[(years >= second_range[0]) & (years <= second_range[1])].copy()
    second_range_filter_df['Topic Range'] = 'Second range'
    if first_range_filter_df.empty or second_range_filter_df.empty:
        # The plot needs documents in both categories.
        st.warning("One of the selected ranges has no text in the chosen column.")
        return
    filtered_df = pd.concat([first_range_filter_df, second_range_filter_df])

    if st.button("Generate the Scattertext Plot"):
        # make plot
        corpus = sct.CorpusFromPandas(
            filtered_df,
            category_col="Topic Range",
            text_col=selected_column,
            nlp=nlp,
        ).build()
        # generate HTML visualization
        html = sct.produce_scattertext_explorer(
            corpus,
            category='First range',
            category_name='First range',
            not_category_name='Second range',
            width_in_pixels=900,
            minimum_term_frequency=0,
            metadata=filtered_df,
        )
        st.components.v1.html(html, width=1000, height=600)
        # Give the download a proper name and type so it opens in a browser.
        st.download_button('Download the plot', html,
                           file_name='scattertext_plot.html', mime='text/html')
if __name__ == '__main__':
    # Entry point: read the uploaded file, resolve the column names the
    # comparison functions read as module-level globals (df, source_title_col,
    # year_col), then dispatch on the chosen comparison type.
    uploaded_file = st.file_uploader("Choose a file", type=["csv", "txt"])
    if uploaded_file is not None:
        # determine file type; compare case-insensitively so "DATA.CSV" works
        file_name = uploaded_file.name.lower()
        if file_name.endswith(".csv"):
            df = pd.read_csv(uploaded_file)
            try:
                source_title_col = find_column_name(df, 'Source Title')
                abstract_col = find_column_name(df, 'Abstract')
                title_col = find_column_name(df, 'Title')
                year_col = find_column_name(df, 'Year')
            except ValueError as err:
                # Show a readable message instead of a traceback.
                st.error(str(err))
                st.stop()
        elif file_name.endswith(".txt"):
            # Doc: assume contents are separated by tabs.
            df = pd.read_table(uploaded_file, sep='\t')
            abstract_col = 'AB'
            title_col = 'TI'
            source_title_col = 'SO'
            year_col = 'PY'
            # The tags above are fixed, so check them now rather than
            # failing later with a KeyError.
            missing = [name for name in (abstract_col, title_col, source_title_col, year_col)
                       if name not in df.columns]
            if missing:
                st.error(f"Missing expected column(s): {', '.join(missing)}")
                st.stop()
        else:
            st.error("Unsupported file format.")
            st.stop()

        column_choices = (abstract_col, title_col)
        row1_col1, row1_col2 = st.columns(2)
        with row1_col1:
            choice = st.selectbox("Choose column to analyze", column_choices)
        with row1_col2:
            comparison_options = ('Sources', 'Years')
            type_of_comparison = st.selectbox("Type of comparison", comparison_options)

        # The two options are mutually exclusive.
        if type_of_comparison == 'Sources':
            compatison1(choice)
        elif type_of_comparison == 'Years':
            comparison2(choice)
|