# scattertext / test_function.py
# Oliviayc's picture
# change the naming method of HTML file.
# 17374a5
import scattertext as st
import spacy
import pandas as pd
import en_core_web_md
# Load the spaCy English pipeline once at import time; customized_file reuses
# this shared instance for every corpus it builds. A single load is enough:
# loading the same model a second time only costs start-up time and memory.
nlp = spacy.load("en_core_web_md")
def customized_file(
    file_path: str,
    column_category: str,
    column_text: str,
    subcategory1: str,
    subcategory2: str,
) -> str:
    """
    Generate a scattertext plot from a user-selected file.

    The file is read into a DataFrame, reduced to the rows whose
    ``column_category`` value equals one of the two requested subcategories,
    converted into a scattertext corpus and rendered to an HTML file in the
    current working directory.

    :param file_path: path of the file to analyse. ``.csv`` files are read with
        ``pandas.read_csv``; ``.txt`` files are read as tab-separated tables.
        The extension is matched case-insensitively.
    :param column_category: header of the column holding the subcategories
    :param column_text: header of the column holding the text to be plotted
    :param subcategory1: subcategory passed to scattertext as the target
        ``category`` (also used as ``category_name``)
    :param subcategory2: subcategory used as ``not_category_name``
    :return: the path of the written scattertext HTML file
    :raises ValueError: if the file extension is unsupported, if no row matches
        either subcategory, or if none of the matching rows has any text
    """
    # Choose the reader from the extension. Compare on a lower-cased copy so
    # "DATA.CSV" is handled like "data.csv"; the original path is still the
    # one handed to pandas.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".csv"):
        df = pd.read_csv(file_path)
    elif lowered_path.endswith(".txt"):
        # .txt input is assumed to be tab-separated.
        df = pd.read_table(file_path, sep='\t')
    else:
        raise ValueError("Unsupported file format.")

    # Keep only the rows that belong to one of the two target subcategories.
    df_filtered = df[df[column_category].isin([subcategory1, subcategory2])]
    if df_filtered.empty:
        raise ValueError("This contect is empty. Check again")

    # A missing text value is read by pandas as NaN, which is not a string and
    # cannot be fed to the NLP pipeline; drop those rows up front and fail with
    # a clear message if nothing is left.
    df_filtered = df_filtered.dropna(subset=[column_text])
    if df_filtered.empty:
        raise ValueError(
            f"No rows with text in column '{column_text}' for the selected subcategories."
        )

    # Convert to a scattertext corpus (parsed with the module-level `nlp`).
    corpus = st.CorpusFromPandas(
        df_filtered,
        category_col=column_category,
        text_col=column_text,
        nlp=nlp,
    ).build()

    # Create the visualization.
    html = st.produce_scattertext_explorer(
        corpus,
        category=subcategory1,
        category_name=subcategory1,
        not_category_name=subcategory2,
        width_in_pixels=1000,
        minimum_term_frequency=0,
        metadata=df_filtered[column_category],
    )

    # Subcategory values are arbitrary data (e.g. "WOS:000685648800006") and may
    # contain characters that are path separators or are not allowed in file
    # names on some platforms; replace those so the output file can always be
    # created in the current directory.
    unsafe_chars = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})
    name_part1 = str(subcategory1).translate(unsafe_chars)
    name_part2 = str(subcategory2).translate(unsafe_chars)
    html_file_path = f"scattertext_{name_part1}_{name_part2}.html"

    with open(html_file_path, "w", encoding='utf-8') as f:
        f.write(html)
    return html_file_path
# Sample runs against the two files under ./sample_data. Each call writes an
# HTML plot and binds the returned file path. NOTE: these are top-level
# statements, so they execute whenever this module is run or imported.
test1 = customized_file(
    file_path='./sample_data/lens.csv',
    column_category='Lens ID',
    column_text='Abstract',
    subcategory1='032-211-407-789-770',
    subcategory2='036-842-302-145-799',
)
test2 = customized_file(
    file_path='./sample_data/wos.txt',
    column_category='UT',
    column_text='AB',
    subcategory1='WOS:000685648800006',
    subcategory2='WOS:000448455800001',
)