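"""Streamlit dashboard for company sentiment analysis.

The app has two pages: a tweet sentiment analysis page that classifies company
tweets (batch or on-demand) with the OpenAI gpt-3.5-turbo model, and an audio
analysis page that transcribes Kinyarwanda audio recordings for MTN, Liquid,
and Irembo, translates them to English, and visualizes the resulting sentiment.

Launch as a Streamlit app, e.g. `streamlit run <this file>`.
"""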
import openai
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from dotenv import load_dotenv
import os
import time
import glob
from audio_predictions import AudioTranslation
# Load environment variables and configure the OpenAI API key
load_dotenv()
API_KEY = os.getenv('OPENAI_API_KEY')
if API_KEY:
    openai.api_key = API_KEY
else:
    print("No API key provided. Please set the OPENAI_API_KEY environment variable.")
    # Fallback behavior could be implemented here, or the app could exit gracefully

dataset_path = 'updated_company_tweets.csv'

def load_data():
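    """Load the main sentiment dataset and the Kinyarwanda dataset and merge them on tweet_id."""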
    main_sentiment_df = pd.read_csv('main_sentiment_df.csv')
    kinya_df = pd.read_csv('kinya.csv')
    return pd.merge(main_sentiment_df, kinya_df, on='tweet_id', how='left')

def list_audio_files(directory):
    return glob.glob(os.path.join(directory, '*.mp3'))

# Function to display audio player widgets
def display_audio_players(audio_files, column):
    for file in audio_files:
        with column:
            st.audio(file)
            st.text(os.path.basename(file))

def process_audio_files(directories):
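    """Transcribe and translate every .mp3 file in the given company directories and return the results as a DataFrame."""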
    audio_translator = AudioTranslation()
    results = []
    for directory in directories:
        audio_files = list_audio_files(f"{directory}/")
        for file_path in audio_files:
            transc = audio_translator.transcribe_audio(file_path)
            print(f'Transcribed {file_path}: {transc}')
            translation_result = audio_translator.translate_sentence("rw", "en", "MULTI-rw-en", "", transc)
            results.append({
                "filename": os.path.basename(file_path),
                "company": directory,
                "transcription": transc,
                "translation": translation_result["translation"]
            })
    results_df = pd.DataFrame(results)
    return results_df

def audio_analysis_page():
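    """Render the audio analysis page: per-company audio players plus on-demand transcription, translation, and sentiment visualizations."""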
st.header("Audio Analysis") | |
# Display audio files in columns | |
col1, col2, col3 = st.columns(3) | |
with col1: | |
st.subheader("MTN") | |
mtn_files = list_audio_files("mtn/") | |
display_audio_players(mtn_files, col1) | |
with col2: | |
st.subheader("Liquid") | |
liquid_files = list_audio_files("liquid/") | |
display_audio_players(liquid_files, col2) | |
with col3: | |
st.subheader("Irembo") | |
irembo_files = list_audio_files("irembo/") | |
display_audio_players(irembo_files, col3) | |
# Process button (functionality to be defined) | |
if st.button("Process"): | |
#st.write("Process function not yet implemented") | |
results_df = process_audio_files(["mtn", "liquid", "irembo"]) | |
st.dataframe(results_df) | |
# Process dataset for each company and display visualizations | |
for company in ["mtn", "liquid", "irembo"]: | |
st.write(f"Company: {company.upper()}") | |
company_data = process_dataset_for_audio(results_df, company) | |
display_audio_visualizations(company_data) | |
def display_audio_visualizations(company_data):
    col1, col2 = st.columns(2)
    with col1:
        st.write("Sentiment Distribution")
        pie_chart = generate_audiopie(company_data)
        st.pyplot(pie_chart)
    with col2:
        st.write("Word Cloud for Translations")
        word_cloud = generate_audioword_cloud(company_data)
        st.pyplot(word_cloud)

def generate_audiopie(data):
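    """Build a pie chart of sentiment counts for the processed audio results."""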
    start_time = time.time()
    sentiment_counts = data['sentiment_score'].value_counts()
    # Define colors for different sentiments
    colors = {'Positive sentiment': 'green', 'Negative sentiment': 'red', 'Neutral': 'blue'}
    pie_colors = [colors.get(sentiment) for sentiment in sentiment_counts.index]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=pie_colors)
    ax.axis('equal')  # Keeps the pie chart circular
    end_time = time.time()
    print(f'Pie chart execution time: {end_time - start_time} seconds')
    return fig
def generate_audioword_cloud(data):
    """Build a word cloud from the audio transcriptions."""
    start_time = time.time()
    text_data = ' '.join(data['transcription'].dropna())
    wordcloud = WordCloud(width=1000, height=600, background_color='white').generate(text_data)
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    end_time = time.time()
    print(f'Word cloud execution time: {end_time - start_time} seconds')
    return fig
def display_company_visualizations(company_data):
    col1, col2 = st.columns(2)
    with col1:
        st.write("Sentiment Distribution")
        pie_chart = generate_pie_chart(company_data, company_data['company'].iloc[0])
        st.pyplot(pie_chart)
    with col2:
        st.write("Word Cloud for Translations")
        word_cloud = generate_word_cloud(company_data, company_data['company'].iloc[0])
        st.pyplot(word_cloud)
def analyze_sentiment(texts):
    """Analyze the sentiment of a batch of texts using the OpenAI API."""
    try:
        responses = []
        for text in texts:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a sentiment analysis model."},
                    {"role": "user", "content": text}
                ]
            )
            sentiment_response = response.choices[0].message['content']
            if "positive" in sentiment_response.lower():
                responses.append("Positive sentiment")
            elif "negative" in sentiment_response.lower():
                responses.append("Negative sentiment")
            else:
                responses.append('Neutral')
        return responses
    except Exception as e:
        print(f"An error occurred: {e}")
        return ["Error"] * len(texts)

def process_dataset(data):
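    """Run sentiment analysis over the 'text' column of the full dataset and save the result to predictions.csv."""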
    start_time = time.time()
    texts = data['text'].tolist()
    data['sentiment_score'] = analyze_sentiment(texts)
    end_time = time.time()
    print(f'process_dataset execution time: {end_time - start_time} seconds')
    data.to_csv('predictions.csv', index=False)
    return data

def generate_pie_chart(data, selected_company):
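    """Build a pie chart of sentiment counts for the selected company."""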
    start_time = time.time()
    # Filter data for the selected company
    company_data = data[data['company_id'] == selected_company]
    sentiment_counts = company_data['sentiment_score'].value_counts()
    colors = {'Positive sentiment': 'green', 'Negative sentiment': 'red', 'Neutral': 'blue'}
    pie_colors = [colors.get(sentiment) for sentiment in sentiment_counts.index]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=pie_colors)
    ax.axis('equal')  # Keeps the pie chart circular
    end_time = time.time()
    print(f'Pie chart execution time: {end_time - start_time} seconds')
    return fig

def generate_word_cloud(data, selected_company):
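    """Build a word cloud for the selected company, preferring the manual Kinyarwanda translation column when it is available."""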
    start_time = time.time()
    # Filter data for the selected company
    company_data = data[data['company_id'] == selected_company]
    # Choose the appropriate text column based on the selected company's data
    if 'translated_kinyarwanda_manual' in company_data.columns and company_data['translated_kinyarwanda_manual'].notna().any():
        text_column = 'translated_kinyarwanda_manual'
    else:
        text_column = 'text'
    text_data = ' '.join(company_data[text_column].dropna())
    wordcloud = WordCloud(width=1000, height=600, background_color='white').generate(text_data)
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    end_time = time.time()
    print(f'Word cloud execution time: {end_time - start_time} seconds')
    return fig

def generate_time_series_chart(data, selected_company):
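    """Plot daily counts of each sentiment label over time for the selected company."""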
    start_time = time.time()
    # Filter data for the selected company; copy to avoid mutating the original frame
    company_data = data[data['company_id'] == selected_company].copy()
    company_data['date'] = pd.to_datetime(company_data['date'])
    company_data.sort_values('date', inplace=True)
    grouped = company_data.groupby([company_data['date'].dt.date, 'sentiment_score']).size().unstack().fillna(0)
    # Define colors for different sentiments
    colors = {'Positive sentiment': 'green', 'Negative sentiment': 'red', 'Neutral': 'blue'}
    fig, ax = plt.subplots(figsize=(10, 6))
    # Plot each sentiment score with its corresponding color
    for sentiment in grouped.columns:
        ax.plot(grouped.index, grouped[sentiment], label=sentiment, color=colors.get(sentiment, 'black'))
    ax.set_title('Sentiment Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Count')
    ax.legend()
    end_time = time.time()
    print(f'Time series chart execution time: {end_time - start_time} seconds')
    return fig

def process_dataset_for_company(company_data):
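    """Run sentiment analysis for a single company's tweets, using the English translations when manual Kinyarwanda translations exist."""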
    start_time = time.time()
    # Determine the column to analyze based on 'translated_kinyarwanda_manual' availability
    analyze_column = 'english' if 'translated_kinyarwanda_manual' in company_data.columns and \
        company_data['translated_kinyarwanda_manual'].notna().any() else 'text'
    texts = company_data[analyze_column].tolist()
    company_data['sentiment_score'] = analyze_sentiment(texts)
    end_time = time.time()
    print(f'process_dataset_for_company execution time: {end_time - start_time} seconds')
    return company_data

def process_dataset_for_audio(company_data, company):
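    """Run sentiment analysis over the translated audio transcriptions for one company."""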
    start_time = time.time()
    # Keep only the rows for the requested company; copy to avoid a SettingWithCopy warning
    result = company_data[company_data.company == company].copy()
    texts = result['translation'].tolist()
    result['sentiment_score'] = analyze_sentiment(texts)
    end_time = time.time()
    print(f'process_dataset_for_audio execution time: {end_time - start_time} seconds')
    return result
def display_charts(data, selected_company):
    col1, col2 = st.columns(2)
    with col1:
        st.write("Sentiment Distribution")
        pie_chart = generate_pie_chart(data, selected_company)
        st.pyplot(pie_chart)
    with col2:
        st.write("Word Cloud for Text")
        word_cloud = generate_word_cloud(data, selected_company)
        st.pyplot(word_cloud)
    st.write('Sentiment Trend Over Time')
    time_series_chart = generate_time_series_chart(data, selected_company)
    st.pyplot(time_series_chart)

def display_sampled_data(data):
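    """Show up to five tweets from distinct profiles for each company, substituting the manual translations when available."""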
    sampled_data = pd.DataFrame()
    for company in data['company_id'].unique():
        company_data = data[data['company_id'] == company]
        unique_profiles = company_data.drop_duplicates(subset='profile_name')
        sampled_company_data = unique_profiles.sample(n=min(5, len(unique_profiles)), replace=False)
        if 'translated_kinyarwanda_manual' in company_data.columns and company_data['translated_kinyarwanda_manual'].notna().any():
            sampled_company_data['text'] = sampled_company_data['translated_kinyarwanda_manual']
        sampled_data = pd.concat([sampled_data, sampled_company_data], ignore_index=True)
    columns_to_display = ['tweet_id', 'company_id', 'user_id', 'profile_name', 'text', 'date']
    st.dataframe(sampled_data[columns_to_display])

def run_online_mode(data):
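    """Online mode: run sentiment analysis for the selected company on demand and display the charts."""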
    company_list = data['company_id'].unique()
    selected_company = st.selectbox('Select a Company', company_list)
    if selected_company:
        # Copy so that adding the sentiment column does not modify the original frame
        company_data = data[data['company_id'] == selected_company].copy()
        st.write(f'Sample of the collected data for {selected_company}')
        st.dataframe(company_data.head(10))
        processed_data = process_dataset_for_company(company_data)
        display_charts(processed_data, selected_company)

def run_batch_processing_mode():
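    """Batch mode: reuse predictions.csv when it exists, otherwise process the whole dataset, then display charts for the selected company."""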
    if os.path.exists('predictions.csv'):
        processed_data = pd.read_csv('predictions.csv')
    else:
        data = load_data()
        processed_data = process_dataset(data)
        processed_data.to_csv('predictions.csv', index=False)
    company_list = processed_data['company_id'].unique()
    selected_company = st.selectbox('Select a Company', company_list)
    if selected_company:
        company_data = processed_data[processed_data['company_id'] == selected_company]
        st.write(f'Sample of the collected data for {selected_company}')
        st.dataframe(company_data.head(10))
        display_charts(company_data, selected_company)

def sentiment_analysis_page():
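    """Render the sentiment analysis page with a choice of batch or online processing."""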
    st.title('Company Sentiment Analysis')
    processing_mode = st.selectbox("Choose Processing Mode", ["Batch Processing", "Online"])
    data = load_data()
    display_sampled_data(data)
    if processing_mode == "Online":
        run_online_mode(data)
    else:
        run_batch_processing_mode()

def main():
    st.sidebar.title('Navigation')
    page = st.sidebar.radio("Select a Page", ["Sentiment Analysis", "Audio Analysis"])
    if page == "Sentiment Analysis":
        sentiment_analysis_page()
    elif page == "Audio Analysis":
        audio_analysis_page()

if __name__ == "__main__":
    main()