Spaces:

SammyGasana
/

artemis-analysis

Sleeping

File size: 13,825 Bytes

import openai
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from dotenv import load_dotenv
import os
import time
import glob
from audio_predictions import AudioTranslation


# Load variables from a local .env file (when one exists) into the process
# environment. `load_dotenv` was imported but never invoked, so a key stored
# in .env was silently ignored. Variables already set in the environment win.
load_dotenv()

API_KEY = os.getenv('OPENAI_API_KEY')
if API_KEY:
    # Configure the OpenAI client used by analyze_sentiment.
    openai.api_key = API_KEY
else:
    print("No API key provided. Please set the OPENAI_API_KEY environment variable.")
    # NOTE: the app keeps running without a key; OpenAI requests made later
    # are expected to fail and be reported by their own error handling.

# Path of the tweets dataset. NOTE(review): not referenced in the visible
# part of this file - confirm whether it is still needed.
dataset_path = 'updated_company_tweets.csv'

@st.cache_data
def load_data():
    """Load the sentiment and Kinyarwanda CSV files and join them.

    Reads ``main_sentiment_df.csv`` and ``kinya.csv`` from the working
    directory and left-joins the second onto the first on ``tweet_id``.

    Returns:
        The merged DataFrame (every row of the sentiment file is kept).
    """
    sentiment_frame = pd.read_csv('main_sentiment_df.csv')
    kinyarwanda_frame = pd.read_csv('kinya.csv')
    return sentiment_frame.merge(kinyarwanda_frame, on='tweet_id', how='left')
def list_audio_files(directory):
    """Return the paths of all ``*.mp3`` files directly inside ``directory``.

    Args:
        directory: Folder to scan (not searched recursively).

    Returns:
        A list of matching paths, sorted so the order is stable across
        runs and platforms (``glob`` itself gives no ordering guarantee).
        Empty when the folder is missing or holds no ``.mp3`` files.
    """
    return sorted(glob.glob(os.path.join(directory, '*.mp3')))
def display_audio_players(audio_files, column):
    """Show an audio player and a file-name caption for each file in ``column``.

    Args:
        audio_files: Iterable of audio file paths.
        column: Streamlit container (used as a context manager) that the
            widgets are rendered into.
    """
    for audio_path in audio_files:
        caption = os.path.basename(audio_path)
        with column:
            st.audio(audio_path)
            st.text(caption)
def process_audio_files(directories):
    """Transcribe and translate every ``.mp3`` file in the given folders.

    For each directory, every file returned by ``list_audio_files`` is
    transcribed with ``AudioTranslation.transcribe_audio`` and the
    transcription is passed to ``AudioTranslation.translate_sentence``
    (called with "rw" -> "en"; presumably Kinyarwanda to English).

    Args:
        directories: Iterable of folder names; each name is also stored as
            the ``company`` value of the rows produced from that folder.

    Returns:
        A DataFrame with the columns ``filename``, ``company``,
        ``transcription`` and ``translation``. The columns are present even
        when no audio file was found, so callers can filter on ``company``
        without special-casing an empty result.
    """
    result_columns = ["filename", "company", "transcription", "translation"]
    audio_translator = AudioTranslation()
    results = []
    for directory in directories:
        for file_path in list_audio_files(f"{directory}/"):
            transc = audio_translator.transcribe_audio(file_path)
            # Progress / debugging output on the server console.
            print(file_path)
            print('transcription')
            print(transc)
            translation_result = audio_translator.translate_sentence("rw", "en", "MULTI-rw-en", "", transc)

            results.append({
                "filename": os.path.basename(file_path),
                "company": directory,
                "transcription": transc,
                # translate_sentence is indexed like a mapping here.
                # NOTE(review): confirm what it returns on failure.
                "translation": translation_result["translation"],
            })

    # Passing the columns explicitly keeps the schema for an empty `results`.
    return pd.DataFrame(results, columns=result_columns)
def audio_analysis_page():
    """Render the "Audio Analysis" page.

    Shows one column of audio players per company, and - once the
    "Process" button is pressed - transcribes/translates all files, shows
    the resulting table and draws the per-company visualizations.
    Companies without any processed audio are reported and skipped instead
    of being passed to the chart code (a word cloud cannot be built from
    empty text).
    """
    st.header("Audio Analysis")

    # (heading shown to the user, folder name which doubles as company key)
    companies = [("MTN", "mtn"), ("Liquid", "liquid"), ("Irembo", "irembo")]

    # One column of audio players per company.
    for column, (label, folder) in zip(st.columns(len(companies)), companies):
        with column:
            st.subheader(label)
            display_audio_players(list_audio_files(f"{folder}/"), column)

    if not st.button("Process"):
        return

    company_keys = [folder for _, folder in companies]
    results_df = process_audio_files(company_keys)
    st.dataframe(results_df)
    if results_df.empty:
        # Nothing to analyse; filtering an empty frame by company would fail.
        st.write("No audio files were found to process.")
        return

    # Sentiment scoring and charts, one company at a time.
    for company in company_keys:
        st.write(f"Company: {company.upper()}")
        company_data = process_dataset_for_audio(results_df, company)
        if company_data.empty:
            st.write(f"No audio files found for {company.upper()}.")
            continue
        display_audio_visualizations(company_data)
def display_audio_visualizations(company_data):
    """Draw the sentiment pie chart and the word cloud side by side.

    Args:
        company_data: Audio results for a single company, as expected by
            ``generate_audiopie`` and ``generate_audioword_cloud``.
    """
    left, right = st.columns(2)
    with left:
        st.write("Sentiment Distribution")
        pie_chart = generate_audiopie(company_data)
        st.pyplot(pie_chart)
        # Figures created through pyplot stay registered until closed;
        # release them after rendering so reruns do not accumulate figures.
        plt.close(pie_chart)
    with right:
        st.write("Word Cloud for Translations")
        word_cloud = generate_audioword_cloud(company_data)
        st.pyplot(word_cloud)
        plt.close(word_cloud)
def generate_audiopie(data):
    """Build a pie chart of the ``sentiment_score`` distribution.

    Args:
        data: DataFrame with a ``sentiment_score`` column, already limited
            to the rows that should be charted (no filtering happens here).

    Returns:
        The matplotlib figure containing the pie chart.
    """
    start_time = time.time()
    sentiment_counts = data['sentiment_score'].value_counts()
    # Fixed colour per known label. Any other label (e.g. the "Error" marker
    # produced when sentiment analysis fails) gets a neutral grey rather than
    # `None`, which would let matplotlib pick a default colour.
    colors = {'Positive sentiment': 'green', 'Negative sentiment': 'red', 'Neutral': 'blue'}
    pie_colors = [colors.get(sentiment, 'gray') for sentiment in sentiment_counts.index]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=pie_colors)
    ax.axis('equal')  # Keeps the pie chart circular
    end_time = time.time()
    print(f'Pie chart execution time: {end_time - start_time} seconds')
    return fig
def generate_audioword_cloud(data):
    """Build a word cloud from the ``transcription`` column of ``data``.

    Args:
        data: DataFrame with a ``transcription`` column; missing values are
            dropped and the rest joined with spaces. No filtering by company
            is done here.

    Returns:
        The matplotlib figure showing the word cloud (axes hidden).
    """
    started = time.time()
    joined_text = ' '.join(data['transcription'].dropna())
    cloud = WordCloud(width=1000, height=600, background_color='white').generate(joined_text)

    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')

    finished = time.time()
    print(f'Word cloud execution time: {finished - started} seconds')
    return fig
def display_company_visualizations(company_data):
    """Draw the pie chart and word cloud for the company found in ``company_data``.

    The company is taken from the first row of the ``company`` column and
    passed to both chart builders. (Previously the word cloud received the
    first *translation text* as its company selector, which could never
    match a company.)

    NOTE(review): ``generate_pie_chart`` and ``generate_word_cloud`` filter
    on a ``company_id`` column, so ``company_data`` must carry that column
    as well as ``company`` - confirm against the intended caller.

    Args:
        company_data: Non-empty DataFrame with a ``company`` column.
    """
    selected_company = company_data['company'].iloc[0]
    col1, col2 = st.columns(2)
    with col1:
        st.write("Sentiment Distribution")
        pie_chart = generate_pie_chart(company_data, selected_company)
        st.pyplot(pie_chart)
        # Release the pyplot-managed figure once it has been rendered.
        plt.close(pie_chart)
    with col2:
        st.write("Word Cloud for Translations")
        word_cloud = generate_word_cloud(company_data, selected_company)
        st.pyplot(word_cloud)
        plt.close(word_cloud)

def analyze_sentiment(texts):
    """Label each text's sentiment using the OpenAI chat completion API.

    One request is sent per text. The reply is lower-cased and searched
    for the substring "positive" first, then "negative"; anything else is
    labelled neutral.

    A failing request only affects its own text: that entry becomes
    "Error" and the labels already obtained for other texts are kept
    (previously a single failure replaced every label with "Error").

    Args:
        texts: Iterable of texts to classify.

    Returns:
        A list with one label per input text, each one of
        "Positive sentiment", "Negative sentiment", "Neutral" or "Error".
    """
    labels = []
    for text in texts:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a sentiment analysis model."},
                    {"role": "user", "content": text}
                ]
            )
            reply = response.choices[0].message['content'].lower()
        except Exception as e:
            # Broad on purpose: any request failure should degrade to an
            # "Error" label for this text instead of aborting the page.
            print(f"An error occurred: {e}")
            labels.append("Error")
            continue

        if "positive" in reply:
            labels.append("Positive sentiment")
        elif "negative" in reply:
            labels.append("Negative sentiment")
        else:
            labels.append('Neutral')
    return labels

@st.cache_data()
def process_dataset(data):
    """Score the sentiment of every row's ``text`` and save the result.

    Works on a copy, so the DataFrame passed in by the caller is not
    modified. The scored frame is written to ``predictions.csv`` without
    the index column, matching how ``run_batch_processing_mode`` writes
    the same file.

    Args:
        data: DataFrame with a ``text`` column.

    Returns:
        A copy of ``data`` with an added ``sentiment_score`` column.
    """
    start_time = time.time()
    scored = data.copy()
    text_column = 'text'
    texts = scored[text_column].tolist()
    scored['sentiment_score'] = analyze_sentiment(texts)
    end_time = time.time()
    print(f'process dataset execution time : {end_time - start_time} seconds')
    scored.to_csv('predictions.csv', index=False)
    return scored

def generate_pie_chart(data, selected_company):
    """Build a pie chart of the sentiment distribution for one company.

    Args:
        data: DataFrame with ``company_id`` and ``sentiment_score`` columns.
        selected_company: ``company_id`` value whose rows are charted.

    Returns:
        The matplotlib figure containing the pie chart.
    """
    start_time = time.time()
    # Filter data for the selected company
    company_data = data[data['company_id'] == selected_company]
    sentiment_counts = company_data['sentiment_score'].value_counts()
    # Fixed colour per known label. Any other label (e.g. the "Error" marker
    # produced when sentiment analysis fails) gets a neutral grey rather than
    # `None`, which would let matplotlib pick a default colour.
    colors = {'Positive sentiment': 'green', 'Negative sentiment': 'red', 'Neutral': 'blue'}
    pie_colors = [colors.get(sentiment, 'gray') for sentiment in sentiment_counts.index]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=pie_colors)
    ax.axis('equal')  # Keeps the pie chart circular
    end_time = time.time()
    print(f'Pie chart execution time: {end_time - start_time} seconds')
    return fig

def generate_word_cloud(data, selected_company):
    """Build a word cloud from one company's text.

    The ``translated_kinyarwanda_manual`` column is used when it exists and
    holds at least one non-missing value for the company; otherwise the
    ``text`` column is used. Missing values are dropped before joining.

    Args:
        data: DataFrame with a ``company_id`` column and the text columns.
        selected_company: ``company_id`` value whose rows are used.

    Returns:
        The matplotlib figure showing the word cloud (axes hidden).
    """
    started = time.time()
    rows = data[data['company_id'] == selected_company]

    manual_column = 'translated_kinyarwanda_manual'
    has_manual_translation = manual_column in rows.columns and rows[manual_column].notna().any()
    source_column = manual_column if has_manual_translation else 'text'

    joined_text = ' '.join(rows[source_column].dropna())
    cloud = WordCloud(width=1000, height=600, background_color='white').generate(joined_text)

    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')

    finished = time.time()
    print(f'Word cloud execution time: {finished - started} seconds')
    return fig

def generate_time_series_chart(data, selected_company):
    """Plot the daily count of each sentiment label for one company.

    Args:
        data: DataFrame with ``company_id``, ``date`` and
            ``sentiment_score`` columns; ``date`` must be parseable by
            ``pandas.to_datetime``.
        selected_company: ``company_id`` value whose rows are plotted.

    Returns:
        The matplotlib figure with one line per sentiment label.
    """
    start_time = time.time()
    # Work on an explicit copy of the company's rows: assigning to / sorting
    # a filtered slice in place is what triggers pandas' SettingWithCopy
    # warning and is not guaranteed to behave consistently.
    company_data = data[data['company_id'] == selected_company].copy()
    company_data['date'] = pd.to_datetime(company_data['date'])
    company_data = company_data.sort_values('date')
    # Rows: calendar day; columns: sentiment label; values: number of rows
    # (0 where a label did not occur on that day).
    grouped = company_data.groupby([company_data['date'].dt.date, 'sentiment_score']).size().unstack().fillna(0)
    # Define colors for different sentiments
    colors = {'Positive sentiment': 'green', 'Negative sentiment': 'red', 'Neutral': 'blue'}

    fig, ax = plt.subplots(figsize=(10, 6))
    # Plot each sentiment score with its corresponding color; labels without
    # a configured colour are drawn in black.
    for sentiment in grouped.columns:
        ax.plot(grouped.index, grouped[sentiment], label=sentiment, color=colors.get(sentiment, 'black'))

    ax.set_title('Sentiment Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Count')
    ax.legend()
    end_time = time.time()
    print(f'Time series chart execution time: {end_time - start_time} seconds')
    return fig

@st.cache_data()
def process_dataset_for_company(company_data):
    """Add a ``sentiment_score`` column for one company's rows.

    The ``english`` column is analysed when ``translated_kinyarwanda_manual``
    exists and has at least one non-missing value; otherwise ``text`` is.
    NOTE(review): the availability check looks at
    ``translated_kinyarwanda_manual`` while the column read is ``english`` -
    confirm that ``english`` is always present in that case.

    Works on a copy so the frame handed in by the caller (typically a
    filtered slice of a larger frame) is not modified.

    Args:
        company_data: DataFrame holding a single company's rows.

    Returns:
        A copy of ``company_data`` with the added ``sentiment_score`` column.
    """
    start_time = time.time()
    scored = company_data.copy()
    # Determine the column to analyze based on 'translated_kinyarwanda_manual' availability
    analyze_column = 'english' if 'translated_kinyarwanda_manual' in scored.columns and \
                                 scored['translated_kinyarwanda_manual'].notna().any() else 'text'
    texts = scored[analyze_column].tolist()
    scored['sentiment_score'] = analyze_sentiment(texts)
    end_time = time.time()
    print(f'process_dataset_for_company execution time: {end_time - start_time} seconds')
    return scored
@st.cache_data()
def process_dataset_for_audio(company_data, company):
    """Score the sentiment of one company's audio translations.

    Args:
        company_data: DataFrame with ``company`` and ``translation`` columns
            (all companies together).
        company: Value of the ``company`` column to keep.

    Returns:
        A new DataFrame with only that company's rows plus a
        ``sentiment_score`` column computed from ``translation``.
    """
    # Explicit copy: a column is added below, and assigning to a filtered
    # slice triggers pandas' SettingWithCopy warning.
    result = company_data[company_data.company == company].copy()
    start_time = time.time()
    # The English translation is the text that gets analysed.
    analyze_column = 'translation'
    texts = result[analyze_column].tolist()
    result['sentiment_score'] = analyze_sentiment(texts)
    end_time = time.time()
    # The log label previously named process_dataset_for_company (copy/paste).
    print(f'process_dataset_for_audio execution time: {end_time - start_time} seconds')
    return result

def display_charts(data, selected_company):
    """Render the pie chart, word cloud and time-series chart for a company.

    The pie chart and word cloud are placed side by side; the time-series
    chart spans the full width below them.

    Args:
        data: DataFrame passed on to the three chart builders.
        selected_company: ``company_id`` value the charts are built for.
    """
    col1, col2 = st.columns(2)
    with col1:
        st.write("Sentiment Distribution")
        pie_chart = generate_pie_chart(data, selected_company)
        st.pyplot(pie_chart)
        # Figures created through pyplot stay registered until closed;
        # release them after rendering so reruns do not accumulate figures.
        plt.close(pie_chart)
    with col2:
        st.write("Word Cloud for Text")
        word_cloud = generate_word_cloud(data, selected_company)
        st.pyplot(word_cloud)
        plt.close(word_cloud)
    st.write('Sentiment Trend Over Time')
    time_series_chart = generate_time_series_chart(data, selected_company)
    st.pyplot(time_series_chart)
    plt.close(time_series_chart)

def display_sampled_data(data):
    """Show a random sample of up to five distinct profiles per company.

    For each ``company_id`` the rows are de-duplicated on ``profile_name``
    and at most five are sampled without replacement. When the company has
    manual Kinyarwanda translations, the displayed ``text`` is replaced by
    the translation; rows that lack a translation keep their original text
    (previously their text was overwritten with a missing value).

    Args:
        data: DataFrame with the columns ``tweet_id``, ``company_id``,
            ``user_id``, ``profile_name``, ``text`` and ``date`` (and
            optionally ``translated_kinyarwanda_manual``).
    """
    columns_to_display = ['tweet_id', 'company_id', 'user_id', 'profile_name', 'text', 'date']
    manual_column = 'translated_kinyarwanda_manual'

    samples = []
    for company in data['company_id'].unique():
        company_data = data[data['company_id'] == company]
        unique_profiles = company_data.drop_duplicates(subset='profile_name')
        sampled_company_data = unique_profiles.sample(n=min(5, len(unique_profiles)), replace=False)
        if manual_column in company_data.columns and company_data[manual_column].notna().any():
            # Prefer the manual translation, fall back to the original text.
            sampled_company_data['text'] = sampled_company_data[manual_column].fillna(
                sampled_company_data['text']
            )
        samples.append(sampled_company_data)

    # Concatenate once instead of growing a frame inside the loop; with no
    # companies at all, show an empty table that still has the headers.
    if samples:
        sampled_data = pd.concat(samples, ignore_index=True)
    else:
        sampled_data = pd.DataFrame(columns=columns_to_display)
    st.dataframe(sampled_data[columns_to_display])

def run_online_mode(data):
    """Let the user pick a company, score its rows now and show the charts.

    Args:
        data: DataFrame with a ``company_id`` column; the distinct values
            populate the company select box.
    """
    selected_company = st.selectbox('Select a Company', data['company_id'].unique())
    if not selected_company:
        # Nothing (truthy) selected - nothing to show.
        return

    rows_for_company = data[data['company_id'] == selected_company]
    st.write(f'Sample of the collected data for {selected_company}')
    st.dataframe(rows_for_company.head(10))

    scored = process_dataset_for_company(rows_for_company)
    display_charts(scored, selected_company)

def run_batch_processing_mode():
    """Show charts based on stored predictions, creating them if needed.

    ``predictions.csv`` is read when it exists; otherwise the full dataset
    is loaded, scored with ``process_dataset`` and written to that file.
    The user then picks a company whose rows and charts are displayed.
    """
    predictions_path = 'predictions.csv'
    if not os.path.exists(predictions_path):
        processed_data = process_dataset(load_data())
        processed_data.to_csv(predictions_path, index=False)
    else:
        processed_data = pd.read_csv(predictions_path)

    selected_company = st.selectbox('Select a Company', processed_data['company_id'].unique())
    if not selected_company:
        # Nothing (truthy) selected - nothing to show.
        return

    rows_for_company = processed_data[processed_data['company_id'] == selected_company]
    st.write(f'Sample of the collected data for {selected_company}')
    st.dataframe(rows_for_company.head(10))
    display_charts(rows_for_company, selected_company)
def sentiment_analysis_page():
    """Render the "Company Sentiment Analysis" page.

    Shows the mode selector and a sample of the loaded data, then hands
    over to the online or the batch workflow depending on the selection.
    """
    st.title('Company Sentiment Analysis')
    processing_mode = st.selectbox("Choose Processing Mode", ["Batch Processing", "Online"])

    data = load_data()
    display_sampled_data(data)

    if processing_mode != "Online":
        run_batch_processing_mode()
        return
    run_online_mode(data)

def main():
    """Draw the sidebar navigation and render the page the user selected."""
    # Page title -> function that renders it (insertion order = menu order).
    pages = {
        "Sentiment Analysis": sentiment_analysis_page,
        "Audio Analysis": audio_analysis_page,
    }
    st.sidebar.title('Navigation')
    page = st.sidebar.radio("Select a Page", list(pages))
    render_page = pages.get(page)
    if render_page is not None:
        render_page()

# Run the app only when this file is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()