shrut27 committed on
Commit
c4426e9
1 Parent(s): 2d2c5f7

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -0
app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
3
+ import spacy
4
+ from tika import parser
5
+ import requests
6
+ import pandas as pd
7
+
8
# The spaCy pipeline used for sentence splitting is created at module level,
# outside of any Streamlit-cached function.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)
10
+
11
@st.cache(allow_output_mutation=True)
def load_environmental_model():
    """Return a text-classification pipeline for the environmental model.

    Both the tokenizer and the sequence-classification weights are loaded
    from the ``ESGBERT/EnvironmentalBERT-environmental`` checkpoint. The
    function is wrapped in ``st.cache`` so the pipeline is built once and
    reused on later calls.
    """
    checkpoint = "ESGBERT/EnvironmentalBERT-environmental"
    return pipeline(
        "text-classification",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )
17
+
18
@st.cache(allow_output_mutation=True)
def load_social_model():
    """Return a text-classification pipeline for the social model.

    Both the tokenizer and the sequence-classification weights are loaded
    from the ``ESGBERT/SocialBERT-social`` checkpoint. The function is
    wrapped in ``st.cache`` so the pipeline is built once and reused on
    later calls.
    """
    checkpoint = "ESGBERT/SocialBERT-social"
    return pipeline(
        "text-classification",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )
24
+
25
@st.cache(allow_output_mutation=True)
def load_governance_model():
    """Return a text-classification pipeline for the governance model.

    Both the tokenizer and the sequence-classification weights are loaded
    from the ``ESGBERT/GovernanceBERT-governance`` checkpoint. The function
    is wrapped in ``st.cache`` so the pipeline is built once and reused on
    later calls.
    """
    checkpoint = "ESGBERT/GovernanceBERT-governance"
    return pipeline(
        "text-classification",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )
31
+
32
@st.cache(allow_output_mutation=True)
def load_sentiment_model():
    """Return a text-classification pipeline for the climate sentiment model.

    The sequence-classification weights and the tokenizer are loaded from
    the ``climatebert/distilroberta-base-climate-sentiment`` checkpoint; the
    tokenizer is created with ``max_len=512``. The function is wrapped in
    ``st.cache`` so the pipeline is built once and reused on later calls.
    """
    checkpoint = "climatebert/distilroberta-base-climate-sentiment"
    return pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint, max_len=512),
    )
38
+
39
# Streamlit App
#
# Flow: read a report URL and a model choice from the user, download the
# document, extract its text with Tika, split it into sentences with spaCy,
# classify the first MAX_SENTENCES sentences with the chosen model and show
# how many sentences received each label.

# Upper bound on classified sentences; 100 sentences take around 20 seconds.
MAX_SENTENCES = 100

# Seconds to wait for the remote server before giving up on the download.
REQUEST_TIMEOUT_SECONDS = 30

FETCH_ERROR_MESSAGE = (
    "Error fetching PDF content from the provided URL. "
    "Please check the URL and try again."
)

# Maps each dropdown label to the loader that builds its pipeline. The key
# order is also the order of the options shown in the dropdown.
MODEL_LOADERS = {
    "Environmental Model": load_environmental_model,
    "Social Model": load_social_model,
    "Governance Model": load_governance_model,
    "Sentiment Model": load_sentiment_model,
}


def _fetch_report(report_url):
    """Download ``report_url`` and return the response, or ``None`` on failure.

    Any ``requests`` error (malformed URL, DNS failure, refused connection,
    timeout, ...) is turned into ``None`` so the caller can show a friendly
    message instead of an exception traceback.
    """
    try:
        return requests.get(report_url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException:
        return None


def _extract_sentences(raw_text, limit=MAX_SENTENCES):
    """Split ``raw_text`` into cleaned sentences and return at most ``limit``.

    Newlines are removed from every sentence, empty sentences are dropped and
    only sentences whose first character is an uppercase letter are kept.
    """
    # spaCy raises ValueError for texts longer than nlp.max_length. Only the
    # first ``limit`` sentences are used, so parsing a prefix is sufficient.
    doc = nlp(raw_text[:nlp.max_length])
    cleaned = (sent.text.replace("\n", "") for sent in doc.sents)
    kept = [text for text in cleaned if text and text[0].isupper()]
    return kept[:limit]


st.title("ESGBERT Text Classification App")

# Get report URL from user input
url = st.text_input("Enter the URL of the report (PDF):")

# Model selection dropdown
selected_model = st.selectbox("Select Model", list(MODEL_LOADERS))

if url:
    # Download PDF content from the URL
    response = _fetch_report(url)

    if response is not None and response.status_code == 200:
        # Parse PDF and extract text. The extracted content can be missing
        # or None when the document holds no extractable text.
        raw_text = parser.from_buffer(response.content).get("content")

        # Extract, filter and preprocess sentences using spaCy
        sub_sentences = _extract_sentences(raw_text) if raw_text else []

        if not sub_sentences:
            st.warning("No sentences could be extracted from the document at the provided URL.")
        else:
            # Pick the pipeline for the user's selection; any unknown label
            # falls back to the sentiment model.
            pipe_model = MODEL_LOADERS.get(selected_model, load_sentiment_model)()

            # Get predictions for the selected model
            model_results = pipe_model(sub_sentences, padding=True, truncation=True)
            model_labels = [result["label"] for result in model_results]

            # Display count of sentences per label assigned by the selected model
            st.subheader(f"{selected_model} Sentences Count")
            label_counts = (
                pd.DataFrame({"sentence": sub_sentences, selected_model: model_labels})
                .groupby(selected_model)
                .count()
            )
            st.write(label_counts)
    else:
        st.error(FETCH_ERROR_MESSAGE)