Spaces:
Runtime error
Runtime error
import scattertext as st | |
import spacy | |
import pandas as pd | |
import en_core_web_md | |
# Load the spaCy language model a single time at module load; the resulting
# pipeline is stored in the module-level name `nlp` and passed to scattertext
# when a corpus is built. (Loading the same "en_core_web_md" package twice only
# cost extra start-up time and memory, since the first result was overwritten.)
nlp = spacy.load("en_core_web_md")
def customized_file(file_path: str, column_category: str, column_text: str, subcategory1: str, subcategory2: str) -> str:
    '''
    Generate a scattertext explorer plot from a user-selected file.

    The file is read into a DataFrame, reduced to the rows whose
    ``column_category`` value equals one of the two subcategories, converted
    into a scattertext corpus and rendered to an HTML file named
    ``scattertext_<subcategory1>_<subcategory2>.html`` (relative path, so it is
    created in the current working directory).

    :param file_path: path of the file to analyse. ``.csv`` files are read with
        ``pd.read_csv``; ``.txt`` files are read as tab-separated tables. The
        extension is matched case-insensitively.
    :param column_category: header of the column holding the subcategories
    :param column_text: header of the column holding the text to be plotted
    :param subcategory1: subcategory passed to scattertext as the focus
        ``category`` (also used as its display name)
    :param subcategory2: subcategory used as the ``not_category_name`` label
    :return: the path of the written HTML file
    :raises ValueError: if the file extension is neither ``.csv`` nor ``.txt``,
        or if no row with text is left after filtering
    '''
    # Choose the reader from the extension. Compare on a lower-cased copy so
    # that e.g. "DATA.CSV" is accepted, while the original path is still the
    # one handed to pandas.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".csv"):
        df = pd.read_csv(file_path)
    elif lowered_path.endswith(".txt"):
        # contents are assumed to be separated by tabs
        df = pd.read_table(file_path, sep='\t')
    else:
        raise ValueError("Unsupported file format.")

    # Keep only the rows that belong to one of the two target subcategories.
    df_filtered = df[df[column_category].isin([subcategory1, subcategory2])]
    if not df_filtered.empty:
        # A missing text cell is read by pandas as NaN (a float), which the
        # spaCy pipeline cannot parse; drop such rows before building the corpus.
        df_filtered = df_filtered.dropna(subset=[column_text])
    if df_filtered.empty:
        raise ValueError("This content is empty. Check again")

    # Convert the remaining rows to a scattertext corpus, parsing each text
    # with the module-level spaCy pipeline `nlp`.
    corpus = st.CorpusFromPandas(
        df_filtered,
        category_col=column_category,
        text_col=column_text,
        nlp=nlp,
    ).build()

    # Render the interactive explorer. The category value of each document is
    # supplied as its metadata; it is taken from the same filtered frame the
    # corpus was built from so both have one entry per document.
    html = st.produce_scattertext_explorer(
        corpus,
        category=subcategory1,
        category_name=subcategory1,
        not_category_name=subcategory2,
        width_in_pixels=1000,
        minimum_term_frequency=0,
        metadata=df_filtered[column_category],
    )

    # NOTE: the subcategory values are embedded verbatim in the file name.
    html_file_path = f"scattertext_{subcategory1}_{subcategory2}.html"
    with open(html_file_path, "w", encoding='utf-8') as f:
        f.write(html)
    return html_file_path
# Sample invocations on the two files under ./sample_data. They execute as
# module-level statements, so they run whenever this file is executed or
# imported; `test1` and `test2` receive the HTML paths returned by
# customized_file.
test1 = customized_file(
    file_path='./sample_data/lens.csv',
    column_category='Lens ID',
    column_text='Abstract',
    subcategory1='032-211-407-789-770',
    subcategory2='036-842-302-145-799',
)
test2 = customized_file(
    file_path='./sample_data/wos.txt',
    column_category='UT',
    column_text='AB',
    subcategory1='WOS:000685648800006',
    subcategory2='WOS:000448455800001',
)