import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import spacy
from tika import parser
import requests
import pandas as pd

# Load the spaCy model once at import time, outside the Streamlit cache
nlp = spacy.load("en_core_web_sm")


@st.cache_resource()
def load_environmental_model():
    name_env = "ESGBERT/EnvironmentalBERT-environmental"
    tokenizer_env = AutoTokenizer.from_pretrained(name_env)
    model_env = AutoModelForSequenceClassification.from_pretrained(name_env)
    return pipeline("text-classification", model=model_env, tokenizer=tokenizer_env)


@st.cache_resource()
def load_social_model():
    name_soc = "ESGBERT/SocialBERT-social"
    tokenizer_soc = AutoTokenizer.from_pretrained(name_soc)
    model_soc = AutoModelForSequenceClassification.from_pretrained(name_soc)
    return pipeline("text-classification", model=model_soc, tokenizer=tokenizer_soc)


@st.cache_resource()
def load_governance_model():
    name_gov = "ESGBERT/GovernanceBERT-governance"
    tokenizer_gov = AutoTokenizer.from_pretrained(name_gov)
    model_gov = AutoModelForSequenceClassification.from_pretrained(name_gov)
    return pipeline("text-classification", model=model_gov, tokenizer=tokenizer_gov)


@st.cache_resource()
def load_sentiment_model():
    model_name = "climatebert/distilroberta-base-climate-sentiment"
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # model_max_length (not the deprecated max_len) caps inputs at 512 tokens
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)


# Streamlit app
st.title("ESG Report Classification using Natural Language Processing")

# Get the report URL from user input
url = st.text_input("Enter the URL of the report (PDF):")

# Model selection dropdown
st.write("The Environmental, Social, and Governance models each report the count of "
         "sentences labeled for the chosen parameter.")
st.write("The Sentiment model shows whether the company reads as a risk or an "
         "opportunity across all three parameters.")
selected_model = st.selectbox(
    "Select Model",
    ["Environmental Model", "Social Model", "Governance Model", "Sentiment Model"],
)

if url:
    # Download the PDF content from the URL
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        # Parse the PDF and extract its text with Tika
        raw_text = parser.from_buffer(response.content)["content"]

        # Split the text into sentences with spaCy
        doc = nlp(raw_text)
        sentences = [sent.text for sent in doc.sents]

        # Filter and preprocess the sentences: strip newlines, drop empty
        # strings, and keep only sentences that start with an uppercase letter
        sentences = [x.replace("\n", "") for x in sentences]
        sentences = [x for x in sentences if x != ""]
        sentences = [x for x in sentences if x[0].isupper()]
        sub_sentences = sentences[:100]  # classify only the first 100 sentences

        # Pick the classification pipeline that matches the user's selection
        if selected_model == "Environmental Model":
            pipe_model = load_environmental_model()
        elif selected_model == "Social Model":
            pipe_model = load_social_model()
        elif selected_model == "Governance Model":
            pipe_model = load_governance_model()
        else:
            pipe_model = load_sentiment_model()

        # Get predictions from the selected model
        model_results = pipe_model(sub_sentences, padding=True, truncation=True)
        model_labels = [x["label"] for x in model_results]

        # Display the number of sentences per predicted label
        st.subheader(f"{selected_model} Sentences Count")
        st.write(
            pd.DataFrame({"sentence": sub_sentences, selected_model: model_labels})
            .groupby(selected_model)
            .count()
        )
    else:
        st.error("Error fetching PDF content from the provided URL. "
                 "Please check the URL and try again.")
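
# --- Usage note (a sketch, assuming this file is saved as app.py) ---
# Install the dependencies and the spaCy English model, then launch the app:
#
#   pip install streamlit transformers torch spacy tika requests pandas
#   python -m spacy download en_core_web_sm
#   streamlit run app.py
#
# The tika parser shells out to a Java-based Tika server, so a Java runtime
# must be available; the server jar is downloaded automatically on first use.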