Spaces:

Oliviayc
/

scattertext

Runtime error

App Files Files Community

scattertext / app.py

Oliviayc

try to debug dynamically determined column name

db25a2e 11 months ago

raw

history blame contribute delete

5.7 kB

	import scattertext as sct
	import spacy
	import pandas as pd
	import en_core_web_md
	import streamlit as st


	# load language model
	# Load the spaCy pipeline exactly once. The original code loaded the same
	# model twice (``en_core_web_md.load()`` followed by ``spacy.load(...)``) and
	# threw the first instance away, doubling start-up time for no benefit.
	nlp = spacy.load("en_core_web_md")

	# Scopus file loading
	st.title("Scattertext Analysis")
	st.header("Put your file here... ")

	def find_column_name(df, column_hint):
	    """Return the column label of ``df`` that matches ``column_hint``.

	    Two names match when they are equal after lower-casing and removing
	    every whitespace character (spaces, tabs, non-breaking spaces, ...).
	    Labels are converted with ``str()`` first, so a DataFrame that also has
	    non-string column labels can be searched without raising.

	    Args:
	        df: Object whose ``columns`` attribute is iterated for candidates.
	        column_hint: Human-readable column name to look for.

	    Returns:
	        The first matching label, exactly as it appears in ``df.columns``.

	    Raises:
	        ValueError: If no column matches ``column_hint``.
	    """
	    def _normalize(label):
	        # ``str.split()`` without arguments splits on any whitespace run, so
	        # joining the pieces strips all whitespace, not only ASCII spaces.
	        return "".join(str(label).split()).lower()

	    wanted = _normalize(column_hint)
	    for col in df.columns:
	        if _normalize(col) == wanted:
	            return col
	    # Report the hint as the caller wrote it; the normalised form is harder
	    # to recognise in an error message.
	    raise ValueError(f"Column matching '{column_hint}' not found.")

	def compatison1(selected_column):
	    """Compare two sources in a Scattertext plot.

	    Draws two select boxes listing the unique values of
	    ``df[source_title_col]`` and a "Generate" button. When the button is
	    pressed, the rows of the two chosen sources are turned into a
	    Scattertext corpus and the resulting HTML explorer is embedded in the
	    page together with a download button.

	    Relies on the module-level names ``df``, ``source_title_col`` and ``nlp``.

	    Args:
	        selected_column: Name of the ``df`` column holding the text to analyse.

	    Returns:
	        None. A warning is shown instead of a plot when the selection cannot
	        produce a two-category corpus.
	    """
	    # type_of_comparison 1
	    sources = df[source_title_col].unique()
	    row2_col1, row2_col2 = st.columns(2)
	    with row2_col1:
	        first_source = st.selectbox("Choose First Source", sources, key='first_source_select')
	    with row2_col2:
	        second_source = st.selectbox("Choose Second Source", sources, key='second_source_select')

	    # Nothing else to do until the user asks for the plot.
	    if not st.button("Generate the Scattertext Plot"):
	        return

	    if first_source == second_source:
	        st.warning("Please choose two different sources to compare.")
	        return

	    # filter data
	    first_data = df[df[source_title_col] == first_source]
	    second_data = df[df[source_title_col] == second_source]
	    filtered_data = pd.concat([first_data, second_data])

	    # Rows without text would reach spaCy as NaN (a float) and abort the
	    # corpus build, so drop them and make sure the remainder is real text.
	    filtered_data = filtered_data.dropna(subset=[selected_column]).reset_index(drop=True)
	    filtered_data[selected_column] = filtered_data[selected_column].astype(str)

	    # A comparison needs documents on both sides.
	    if filtered_data[source_title_col].nunique() < 2:
	        st.warning("Both sources need at least one row with text in the selected column.")
	        return

	    # make plot
	    corpus = sct.CorpusFromPandas(
	        filtered_data,
	        category_col=source_title_col,
	        text_col=selected_column,
	        nlp=nlp,
	    ).build()
	    # generate HTML visualization
	    # NOTE(review): ``metadata`` receives the whole DataFrame, as in the
	    # original code - confirm this is the per-document metadata shape that
	    # scattertext expects.
	    html = sct.produce_scattertext_explorer(
	        corpus,
	        category=first_source,
	        category_name=first_source,
	        not_category_name=second_source,
	        width_in_pixels=900,
	        minimum_term_frequency=0,
	        metadata=filtered_data,
	    )
	    st.components.v1.html(html, width=1000, height=600)
	    # Name the download so the browser saves it as an HTML file.
	    st.download_button('Download the plot', html, file_name='scattertext.html', mime='text/html')
	    return


	# type_of_comparison 2
	def comparison2(selected_column):
	    """Compare two year ranges in a Scattertext plot.

	    Converts ``df[year_col]`` to integers (rows whose year is not numeric are
	    ignored), draws two range sliders spanning the earliest to the latest
	    year and a "Generate" button. When the button is pressed, the rows
	    falling in each range are labelled ``'First range'`` / ``'Second range'``
	    in a ``'Topic Range'`` column, turned into a Scattertext corpus, and the
	    resulting HTML explorer is embedded with a download button. A row whose
	    year lies in both ranges appears once under each label.

	    Relies on the module-level names ``df``, ``year_col`` and ``nlp``. The
	    work is done on a copy, so the module-level ``df`` is left untouched.

	    Args:
	        selected_column: Name of the ``df`` column holding the text to analyse.

	    Returns:
	        None. A warning is shown instead of a plot when the data or the
	        selection cannot produce a two-category corpus.
	    """
	    years = pd.to_numeric(df[year_col], errors='coerce')
	    has_year = years.notna()
	    data = df[has_year].copy()
	    if data.empty:
	        # min()/max() of an empty column is NaN and int(NaN) raises.
	        st.warning("No rows with a valid year were found.")
	        return
	    data[year_col] = years[has_year].astype(int)

	    min_year = int(data[year_col].min())
	    max_year = int(data[year_col].max())
	    if min_year == max_year:
	        # With a single year both sliders could only select the same rows.
	        st.warning(f"All rows are from {min_year}; two different year ranges cannot be compared.")
	        return

	    # layout row2
	    row2_col1, row2_col2 = st.columns(2)
	    with row2_col1:
	        first_range = st.slider("First range", min_value=min_year, max_value=max_year, step=1,
	                                value=(min_year, max_year))
	    with row2_col2:
	        second_range = st.slider("Second range", min_value=min_year, max_value=max_year, step=1,
	                                 value=(min_year, max_year))

	    # Nothing else to do until the user asks for the plot.
	    if not st.button("Generate the Scattertext Plot"):
	        return

	    if first_range == second_range:
	        st.warning("Please choose two different year ranges to compare.")
	        return

	    def _rows_in(year_range, label):
	        # Rows whose year lies in the inclusive range, tagged with ``label``.
	        start, end = year_range
	        subset = data[data[year_col].between(start, end)].copy()
	        subset['Topic Range'] = label
	        return subset

	    # filter data
	    filtered_df = pd.concat(
	        [_rows_in(first_range, 'First range'), _rows_in(second_range, 'Second range')],
	        ignore_index=True,  # overlapping ranges would otherwise duplicate index labels
	    )

	    # Rows without text would reach spaCy as NaN (a float) and abort the
	    # corpus build, so drop them and make sure the remainder is real text.
	    filtered_df = filtered_df.dropna(subset=[selected_column]).reset_index(drop=True)
	    filtered_df[selected_column] = filtered_df[selected_column].astype(str)

	    # A comparison needs documents on both sides.
	    if filtered_df['Topic Range'].nunique() < 2:
	        st.warning("Both year ranges need at least one row with text in the selected column.")
	        return

	    # make plot
	    corpus = sct.CorpusFromPandas(
	        filtered_df,
	        category_col="Topic Range",
	        text_col=selected_column,
	        nlp=nlp,
	    ).build()
	    # generate HTML visualization
	    # NOTE(review): ``metadata`` receives the whole DataFrame, as in the
	    # original code - confirm this is the per-document metadata shape that
	    # scattertext expects.
	    html = sct.produce_scattertext_explorer(
	        corpus,
	        category='First range',
	        category_name='First range',
	        not_category_name='Second range',
	        width_in_pixels=900,
	        minimum_term_frequency=0,
	        metadata=filtered_df,
	    )
	    st.components.v1.html(html, width=1000, height=600)
	    # Name the download so the browser saves it as an HTML file.
	    st.download_button('Download the plot', html, file_name='scattertext.html', mime='text/html')
	    return


	# Script entry point: upload a file, resolve the column names the comparison
	# functions read from module scope (df, abstract_col, title_col,
	# source_title_col, year_col), then dispatch to the chosen comparison.
	if __name__ == '__main__':
	    uploaded_file = st.file_uploader("Choose a file", type=["csv", "txt"])
	    if uploaded_file is not None:
	        # Compare the extension case-insensitively so "DATA.CSV" is accepted too.
	        file_name = uploaded_file.name.lower()

	        # determine file type
	        if file_name.endswith(".csv"):
	            df = pd.read_csv(uploaded_file)
	            try:
	                source_title_col = find_column_name(df, 'Source Title')
	                abstract_col = find_column_name(df, 'Abstract')
	                title_col = find_column_name(df, 'Title')
	                year_col = find_column_name(df, 'Year')
	            except ValueError as err:
	                # Show a readable message instead of a raw traceback.
	                st.error(str(err))
	                st.stop()
	        elif file_name.endswith(".txt"):
	            df = pd.read_table(uploaded_file, sep='\t')  # Doc: assume contents are separated by tabs.
	            # Fixed two-letter column tags; presumably a bibliographic
	            # export format - confirm against the expected input files.
	            abstract_col = 'AB'
	            title_col = 'TI'
	            source_title_col = 'SO'
	            year_col = 'PY'
	            missing = [col for col in (abstract_col, title_col, source_title_col, year_col)
	                       if col not in df.columns]
	            if missing:
	                # Fail early rather than with a KeyError deep inside a comparison.
	                st.error(f"Missing expected column(s): {', '.join(missing)}")
	                st.stop()
	        else:
	            st.error("Unsupported file format.")
	            st.stop()

	        column_choices = (abstract_col, title_col)

	        row1_col1, row1_col2 = st.columns(2)
	        with row1_col1:
	            choice = st.selectbox("Choose column to analyze", column_choices)
	        with row1_col2:
	            comparison_options = ('Sources', 'Years')
	            type_of_comparison = st.selectbox("Type of comparison", comparison_options)

	        # The two options are mutually exclusive.
	        if type_of_comparison == 'Sources':
	            compatison1(choice)
	        elif type_of_comparison == 'Years':
	            comparison2(choice)