File size: 2,365 Bytes
1d183ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69c06f1
 
1d183ef
 
 
 
 
 
 
 
 
69c06f1
1d183ef
17374a5
1d183ef
 
 
 
 
17374a5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import scattertext as st
import spacy
import pandas as pd
import en_core_web_md



# Load the spaCy language model once at import time; customized_file() passes
# this shared pipeline to scattertext. (The model used to be loaded twice in a
# row, with the first result immediately overwritten.)
nlp = spacy.load("en_core_web_md")

def _safe_filename_part(label: str) -> str:
    """Return *label* with characters that are unsafe in file names replaced by ``_``."""
    unsafe_to_underscore = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})
    return str(label).translate(unsafe_to_underscore)


def customized_file(file_path: str, column_category: str, column_text: str, subcategory1: str, subcategory2: str) -> str:
    """
    Generate a scattertext plot comparing two subcategories of a user-selected file.

    :param file_path: path of the file to analyse; ``.csv`` files are read as
        comma-separated, ``.txt`` files as tab-separated (extension match is
        case-insensitive)
    :param column_category: header of the column holding the subcategories
    :param column_text: header of the column holding the text to be plotted
    :param subcategory1: the subcategory passed to scattertext as ``category``
    :param subcategory2: the subcategory passed to scattertext as the "not" category
    :return: the path of the HTML file the scattertext plot was written to
    :raises ValueError: if the file extension is unsupported, if no row belongs
        to either subcategory, or if one of the two subcategories has no row
        with text
    """
    # Read the data; the extension decides the parser.
    extension_probe = file_path.lower()
    if extension_probe.endswith(".csv"):
        df = pd.read_csv(file_path)
    elif extension_probe.endswith(".txt"):
        df = pd.read_table(file_path, sep='\t')  # Doc: assume contents are separated by tabs.
    else:
        raise ValueError("Unsupported file format.")

    # Keep only the rows that belong to one of the two target subcategories.
    wanted = [subcategory1, subcategory2]
    df_filtered = df[df[column_category].isin(wanted)]
    if df_filtered.empty:
        raise ValueError("This contect is empty. Check again")

    # Rows without text cannot be parsed by the language model, so drop them.
    # Resetting the index gives the filtered frame a contiguous 0..n-1 index,
    # so label-based and positional lookups on it agree.
    df_filtered = df_filtered.dropna(subset=[column_text]).reset_index(drop=True)

    # A comparison plot needs documents on both sides; fail early and clearly
    # when one subcategory has nothing to plot.
    present = set(df_filtered[column_category])
    missing = [name for name in wanted if name not in present]
    if missing:
        raise ValueError(
            "No rows with usable text found for subcategory: "
            + ", ".join(str(name) for name in missing)
        )

    # Convert to a scattertext corpus.
    corpus = st.CorpusFromPandas(df_filtered, category_col=column_category, text_col=column_text, nlp=nlp).build()
    # Create the visualization.
    html = st.produce_scattertext_explorer(corpus,
                                            category = subcategory1,
                                            category_name = subcategory1,
                                            not_category_name = subcategory2,
                                            width_in_pixels = 1000,
                                            minimum_term_frequency = 0,
                                            metadata = df_filtered[column_category])

    # Subcategory labels may contain characters such as ':' or '/' that are not
    # valid in file names on every platform, so sanitize them first.
    html_file_path = (
        f"scattertext_{_safe_filename_part(subcategory1)}"
        f"_{_safe_filename_part(subcategory2)}.html"
    )
    with open(html_file_path, "w", encoding='utf-8') as f:
        f.write(html)

    return html_file_path

# Demo runs against the files under ./sample_data: each call writes an HTML
# plot and stores the path of the written file.
test1 = customized_file(
    file_path='./sample_data/lens.csv',
    column_category='Lens ID',
    column_text='Abstract',
    subcategory1='032-211-407-789-770',
    subcategory2='036-842-302-145-799',
)
test2 = customized_file(
    file_path='./sample_data/wos.txt',
    column_category='UT',
    column_text='AB',
    subcategory1='WOS:000685648800006',
    subcategory2='WOS:000448455800001',
)