import scattertext as st
import spacy
import pandas as pd
import en_core_web_md

# Load the spaCy language model once at import time; customized_file() reuses
# it on every call. `en_core_web_md.load()` is the equivalent package-level
# loader, so a single load is enough.
nlp = spacy.load("en_core_web_md")


def customized_file(file_path: str, column_category: str, column_text: str,
                    subcategory1: str, subcategory2: str) -> str:
    """
    Generate a scattertext explorer plot from a user-selected file.

    The file is read into a DataFrame, reduced to the rows whose category is
    one of the two requested subcategories, converted into a scattertext
    corpus and rendered to an HTML file in the current working directory.

    :param file_path: path of the file to analyse. ``.csv`` files are read
        with ``pandas.read_csv``; ``.txt`` files are read as tab-separated
        tables. The extension is matched case-insensitively.
    :param column_category: header of the column holding the subcategories
    :param column_text: header of the column holding the text to be plotted
    :param subcategory1: the subcategory passed to scattertext as the focus
        ``category`` (also used as its display name)
    :param subcategory2: the subcategory used as the display name of the
        contrasting ("not category") side
    :return: the path of the generated scattertext HTML file
    :raises ValueError: if the file extension is not supported, or if no row
        with text matches either subcategory
    :raises KeyError: if ``column_category`` or ``column_text`` is not a
        column of the file

    NOTE(review): scattertext's default layout appears to put the focus
    category (``subcategory1``) on the y-axis and the other one on the
    x-axis -- confirm against a rendered plot.
    """
    # Read the data. Compare the extension in lower case so that names such
    # as "DATA.CSV" are accepted as well.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".csv"):
        df = pd.read_csv(file_path)
    elif lowered_path.endswith(".txt"):
        # .txt input is assumed to be separated by tabs.
        df = pd.read_table(file_path, sep='\t')
    else:
        raise ValueError("Unsupported file format.")

    # Fail early with a readable message instead of an opaque lookup error
    # from deep inside pandas / scattertext.
    missing_columns = [
        name for name in (column_category, column_text)
        if name not in df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"Column(s) not found in {file_path}: {missing_columns}"
        )

    # Keep only the rows that belong to one of the two target subcategories.
    df_filtered = df[df[column_category].isin([subcategory1, subcategory2])]

    # Rows without text cannot be parsed by the language model, so drop them
    # before building the corpus.
    df_filtered = df_filtered.dropna(subset=[column_text])

    if df_filtered.empty:
        raise ValueError("This content is empty. Check again")

    # Convert to a scattertext corpus.
    corpus = st.CorpusFromPandas(
        df_filtered,
        category_col=column_category,
        text_col=column_text,
        nlp=nlp,
    ).build()

    # Create the visualization.
    html = st.produce_scattertext_explorer(
        corpus,
        category=subcategory1,
        category_name=subcategory1,
        not_category_name=subcategory2,
        width_in_pixels=1000,
        minimum_term_frequency=0,
        metadata=df_filtered[column_category],
    )

    # NOTE(review): the subcategory values are embedded verbatim in the file
    # name; values containing characters such as ':' or '/' may not be valid
    # file names on every platform.
    html_file_path = f"scattertext_{subcategory1}_{subcategory2}.html"
    with open(html_file_path, "w", encoding='utf-8') as f:
        f.write(html)

    return html_file_path


# Sample runs against the bundled data.
# NOTE(review): these execute on import as well as when run as a script;
# consider moving them under an `if __name__ == "__main__":` guard.
test1 = customized_file(
    './sample_data/lens.csv', 'Lens ID', 'Abstract',
    '032-211-407-789-770', '036-842-302-145-799',
)
test2 = customized_file(
    './sample_data/wos.txt', 'UT', 'AB',
    'WOS:000685648800006', 'WOS:000448455800001',
)