# artemis-analysis / main.py
import openai
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from dotenv import load_dotenv
import os
import time
import glob
from audio_predictions import AudioTranslation
# Load environment variables from a local .env file, if present
load_dotenv()

API_KEY = os.getenv('OPENAI_API_KEY')
if API_KEY:
    openai.api_key = API_KEY
else:
    print("No API key provided. Please set the OPENAI_API_KEY environment variable.")
dataset_path = 'updated_company_tweets.csv'
@st.cache_data
def load_data():
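    """Load the sentiment and Kinyarwanda tweet datasets and merge them on tweet_id."""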
main_sentiment_df = pd.read_csv('main_sentiment_df.csv')
kinya_df = pd.read_csv('kinya.csv')
return pd.merge(main_sentiment_df, kinya_df, on='tweet_id', how='left')
def list_audio_files(directory):
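    """Return the paths of all .mp3 files in the given directory."""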
return glob.glob(os.path.join(directory, '*.mp3'))
# Function to display audio player widgets
def display_audio_players(audio_files, column):
for file in audio_files:
with column:
st.audio(file)
st.text(os.path.basename(file))
def process_audio_files(directories):
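    """Transcribe each company's audio files, translate the Kinyarwanda
    transcriptions to English, and return the results as a DataFrame."""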
audio_translator = AudioTranslation()
results = []
for directory in directories:
audio_files = list_audio_files(f"{directory}/")
for file_path in audio_files:
            # Transcribe the audio, then translate the Kinyarwanda transcription to English
            transc = audio_translator.transcribe_audio(file_path)
            print(f'Transcribed {file_path}: {transc}')
            translation_result = audio_translator.translate_sentence("rw", "en", "MULTI-rw-en", "", transc)
            results.append({
                "filename": os.path.basename(file_path),
                "company": directory,
                "transcription": transc,
                "translation": translation_result["translation"]
            })
results_df = pd.DataFrame(results)
return results_df
def audio_analysis_page():
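    """Render the audio analysis page: per-company audio players plus transcription, translation, and sentiment results."""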
st.header("Audio Analysis")
# Display audio files in columns
col1, col2, col3 = st.columns(3)
with col1:
st.subheader("MTN")
mtn_files = list_audio_files("mtn/")
display_audio_players(mtn_files, col1)
with col2:
st.subheader("Liquid")
liquid_files = list_audio_files("liquid/")
display_audio_players(liquid_files, col2)
with col3:
st.subheader("Irembo")
irembo_files = list_audio_files("irembo/")
display_audio_players(irembo_files, col3)
    # Transcribe, translate, and analyze all audio files when the user clicks Process
    if st.button("Process"):
        results_df = process_audio_files(["mtn", "liquid", "irembo"])
        st.dataframe(results_df)
# Process dataset for each company and display visualizations
for company in ["mtn", "liquid", "irembo"]:
st.write(f"Company: {company.upper()}")
company_data = process_dataset_for_audio(results_df, company)
display_audio_visualizations(company_data)
def display_audio_visualizations(company_data):
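    """Show the sentiment pie chart and transcription word cloud for one company's audio results."""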
col1, col2 = st.columns(2)
with col1:
st.write("Sentiment Distribution")
pie_chart = generate_audiopie(company_data)
st.pyplot(pie_chart)
with col2:
st.write("Word Cloud for Translations")
word_cloud = generate_audioword_cloud(company_data)
st.pyplot(word_cloud)
def generate_audiopie(data):
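    """Build a pie chart of sentiment counts for the audio results."""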
start_time = time.time()
    # The caller passes data already filtered to a single company
    company_data = data
sentiment_counts = company_data['sentiment_score'].value_counts()
# Define colors for different sentiments
colors = {'Positive sentiment': 'green', 'Negative sentiment': 'red', 'Neutral': 'blue'}
    pie_colors = [colors.get(sentiment, 'gray') for sentiment in sentiment_counts.index]
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=pie_colors)
ax.axis('equal') # Keeps the pie chart circular
end_time = time.time()
print(f'Pie chart execution time: {end_time - start_time} seconds')
return fig
def generate_audioword_cloud(data):
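    """Build a word cloud from the audio transcriptions."""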
start_time = time.time()
    # The caller passes data already filtered to a single company;
    # the word cloud is built from the raw transcriptions
    company_data = data
    text_column = 'transcription'
text_data = ' '.join(company_data[text_column].dropna())
wordcloud = WordCloud(width=1000, height=600, background_color='white').generate(text_data)
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
end_time = time.time()
print(f'Word cloud execution time: {end_time - start_time} seconds')
return fig
def display_company_visualizations(company_data):
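    """Show the sentiment pie chart and word cloud for a single company's results."""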
col1, col2 = st.columns(2)
with col1:
st.write("Sentiment Distribution")
pie_chart = generate_pie_chart(company_data, company_data['company'].iloc[0])
st.pyplot(pie_chart)
with col2:
st.write("Word Cloud for Translations")
        word_cloud = generate_word_cloud(company_data, company_data['company'].iloc[0])
st.pyplot(word_cloud)
def analyze_sentiment(texts):
"""Analyze the sentiment of a batch of texts using the OpenAI API."""
try:
responses = []
for text in texts:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a sentiment analysis model."},
{"role": "user", "content": text}
]
)
sentiment_response = response.choices[0].message['content']
if "positive" in sentiment_response.lower():
responses.append("Positive sentiment")
elif "negative" in sentiment_response.lower():
responses.append("Negative sentiment")
else:
responses.append('Neutral')
return responses
except Exception as e:
print(f"An error occurred: {e}")
return ["Error"] * len(texts)
@st.cache_data()
def process_dataset(data):
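    """Run sentiment analysis over the whole dataset and save the results to predictions.csv."""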
start_time = time.time()
    # Run sentiment analysis over the raw tweet text
    text_column = 'text'
texts = data[text_column].tolist()
data['sentiment_score'] = analyze_sentiment(texts)
    end_time = time.time()
    print(f'process_dataset execution time: {end_time - start_time} seconds')
    data.to_csv('predictions.csv', index=False)
return data
def generate_pie_chart(data, selected_company):
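    """Build a pie chart of sentiment counts for the selected company."""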
start_time = time.time()
# Filter data for the selected company
company_data = data[data['company_id'] == selected_company]
sentiment_counts = company_data['sentiment_score'].value_counts()
colors = {'Positive sentiment': 'green', 'Negative sentiment': 'red', 'Neutral': 'blue'}
    pie_colors = [colors.get(sentiment, 'gray') for sentiment in sentiment_counts.index]
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=pie_colors)
ax.axis('equal') # Keeps the pie chart circular
end_time = time.time()
print(f'Pie chart execution time: {end_time - start_time} seconds')
return fig
def generate_word_cloud(data, selected_company):
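    """Build a word cloud from the selected company's tweets, preferring manual Kinyarwanda translations when available."""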
start_time = time.time()
# Filter data for the selected company
company_data = data[data['company_id'] == selected_company]
# Choose the appropriate text column based on the selected company's data
if 'translated_kinyarwanda_manual' in company_data.columns and company_data['translated_kinyarwanda_manual'].notna().any():
text_column = 'translated_kinyarwanda_manual'
else:
text_column = 'text'
text_data = ' '.join(company_data[text_column].dropna())
wordcloud = WordCloud(width=1000, height=600, background_color='white').generate(text_data)
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
end_time = time.time()
print(f'Word cloud execution time: {end_time - start_time} seconds')
return fig
def generate_time_series_chart(data, selected_company):
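    """Plot daily sentiment counts over time for the selected company."""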
start_time = time.time()
    # Filter data for the selected company (copy to avoid mutating a view)
    company_data = data[data['company_id'] == selected_company].copy()
    company_data['date'] = pd.to_datetime(company_data['date'])
    company_data.sort_values('date', inplace=True)
grouped = company_data.groupby([company_data['date'].dt.date, 'sentiment_score']).size().unstack().fillna(0)
# Define colors for different sentiments
colors = {'Positive sentiment': 'green', 'Negative sentiment': 'red', 'Neutral': 'blue'}
fig, ax = plt.subplots(figsize=(10, 6))
# Plot each sentiment score with its corresponding color
for sentiment in grouped.columns:
ax.plot(grouped.index, grouped[sentiment], label=sentiment, color=colors.get(sentiment, 'black'))
ax.set_title('Sentiment Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.legend()
end_time = time.time()
print(f'Time series chart execution time: {end_time - start_time} seconds')
return fig
@st.cache_data()
def process_dataset_for_company(company_data):
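    """Run sentiment analysis on a single company's tweets."""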
start_time = time.time()
# Determine the column to analyze based on 'translated_kinyarwanda_manual' availability
analyze_column = 'english' if 'translated_kinyarwanda_manual' in company_data.columns and \
company_data['translated_kinyarwanda_manual'].notna().any() else 'text'
texts = company_data[analyze_column].tolist()
company_data['sentiment_score'] = analyze_sentiment(texts)
end_time = time.time()
print(f'process_dataset_for_company execution time: {end_time - start_time} seconds')
return company_data
@st.cache_data()
def process_dataset_for_audio(company_data, company):
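    """Run sentiment analysis on one company's translated audio results."""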
    # Keep only this company's rows (copy to avoid mutating the cached frame)
    result = company_data[company_data.company == company].copy()
    start_time = time.time()
    # Sentiment is computed on the English translations of the audio
    analyze_column = 'translation'
    texts = result[analyze_column].tolist()
    result['sentiment_score'] = analyze_sentiment(texts)
    end_time = time.time()
    print(f'process_dataset_for_audio execution time: {end_time - start_time} seconds')
return result
def display_charts(data, selected_company):
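    """Show the pie chart, word cloud, and sentiment-over-time chart for the selected company."""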
col1, col2 = st.columns(2)
with col1:
st.write("Sentiment Distribution")
pie_chart = generate_pie_chart(data, selected_company)
st.pyplot(pie_chart)
with col2:
st.write("Word Cloud for Text")
word_cloud = generate_word_cloud(data, selected_company)
st.pyplot(word_cloud)
st.write('Sentiment Trend Over Time')
time_series_chart = generate_time_series_chart(data, selected_company)
st.pyplot(time_series_chart)
def display_sampled_data(data):
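    """Show up to five sample tweets per company, using manual Kinyarwanda translations where available."""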
sampled_data = pd.DataFrame()
for company in data['company_id'].unique():
company_data = data[data['company_id'] == company]
unique_profiles = company_data.drop_duplicates(subset='profile_name')
sampled_company_data = unique_profiles.sample(n=min(5, len(unique_profiles)), replace=False)
if 'translated_kinyarwanda_manual' in company_data.columns and company_data['translated_kinyarwanda_manual'].notna().any():
sampled_company_data['text'] = sampled_company_data['translated_kinyarwanda_manual']
sampled_data = pd.concat([sampled_data, sampled_company_data], ignore_index=True)
columns_to_display = ['tweet_id', 'company_id', 'user_id', 'profile_name', 'text', 'date']
st.dataframe(sampled_data[columns_to_display])
def run_online_mode(data):
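    """Analyze the selected company's tweets on the fly and display its charts."""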
company_list = data['company_id'].unique()
selected_company = st.selectbox('Select a Company', company_list)
if selected_company:
        company_data = data[data['company_id'] == selected_company].copy()
st.write(f'Sample of the collected data for {selected_company}')
st.dataframe(company_data.head(10))
processed_data = process_dataset_for_company(company_data)
display_charts(processed_data, selected_company)
def run_batch_processing_mode():
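    """Load cached predictions if present, otherwise compute and cache them, then display the selected company's charts."""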
if os.path.exists('predictions.csv'):
processed_data = pd.read_csv('predictions.csv')
else:
data = load_data()
processed_data = process_dataset(data)
processed_data.to_csv('predictions.csv', index=False)
company_list = processed_data['company_id'].unique()
selected_company = st.selectbox('Select a Company', company_list)
if selected_company:
company_data = processed_data[processed_data['company_id'] == selected_company]
st.write(f'Sample of the collected data for {selected_company}')
st.dataframe(company_data.head(10))
display_charts(company_data, selected_company)
def sentiment_analysis_page():
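    """Render the tweet sentiment analysis page in batch or online mode."""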
st.title('Company Sentiment Analysis')
processing_mode = st.selectbox("Choose Processing Mode", ["Batch Processing", "Online"])
data = load_data()
display_sampled_data(data)
if processing_mode == "Online":
run_online_mode(data)
else:
run_batch_processing_mode()
def main():
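    """Route between the sentiment analysis and audio analysis pages from the sidebar."""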
st.sidebar.title('Navigation')
page = st.sidebar.radio("Select a Page", ["Sentiment Analysis", "Audio Analysis"])
if page == "Sentiment Analysis":
sentiment_analysis_page()
elif page =="Audio Analysis":
audio_analysis_page()
if __name__ == "__main__":
main()