File size: 4,424 Bytes
3daec5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f592e0b
3daec5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import spacy
from wordcloud import WordCloud
from io import StringIO, BytesIO
import mimetypes
from transformers import CamembertForSequenceClassification, CamembertTokenizer
import torch

# Model Loading
# Classic pipeline: a scikit-learn classifier (LinearSVC, per the UI text
# below) and its fitted text vectorizer, both deserialized from disk.
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# CamemBERT binary classifier: start from the pretrained base checkpoint,
# then overlay locally fine-tuned weights on CPU.
# NOTE(review): strict=False silently skips missing/unexpected keys —
# confirm camembertperso.pth actually matches this architecture.
camembert_model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=2)
state_dict = torch.load('camembertperso.pth', map_location='cpu')
camembert_model.load_state_dict(state_dict, strict=False)
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)

# French spaCy pipeline used for lemmatization and stop-word filtering.
nlp = spacy.load("fr_core_news_sm")

# Text Processing Functions
def clean_text(text):
    """Normalize raw input: trim surrounding whitespace, then lowercase."""
    stripped = text.strip()
    return stripped.lower()

def lemmatize_text(text):
    """Return *text* with each token replaced by its spaCy lemma, space-joined."""
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Prediction Functions
def predict_label(text):
    """Classify *text* with the LinearSVC pipeline.

    Returns a ``(label, probability)`` pair, where the probability is the
    SVM decision-function margin squashed into (0, 1) with a logistic.
    """
    prepared = lemmatize_text(clean_text(text))
    features = vectorizer.transform([prepared])

    label = model.predict(features)[0]
    margin = model.decision_function(features)[0]
    # Map the unbounded margin onto a pseudo-probability.
    probability = 1.0 / (1.0 + np.exp(-margin))

    return label, probability

def predict_camembert(text):
    """Classify *text* with the fine-tuned CamemBERT model.

    Returns ``(label, probability)`` where label is the argmax class
    (0 = negative, 1 = positive per the UI below) and probability is the
    softmax score of the positive class.
    """
    tokens = tokenizer.encode_plus(text, return_tensors="pt")

    with torch.no_grad():
        outputs = camembert_model(**tokens)

        # BUG FIX: the previous code picked outputs[1] whenever the model
        # returned more than one tensor, but when no labels are passed the
        # extra outputs are hidden states / attentions, not logits. The
        # logits live on outputs.logits (ModelOutput) or at index 0 on
        # legacy tuple-style outputs.
        logits = getattr(outputs, "logits", None)
        if logits is None:
            logits = outputs[0]

        predictions = torch.argmax(logits, dim=1).item()
        probabilities = torch.softmax(logits, dim=1)[:, 1].item()

    return predictions, probabilities

# App Interface
st.title('Analyse de sentiments')

st.write('Cet outil permet de prédire si une review est positive ou négative.')

review_text = st.text_area('Saisir la review ou charger un fichier :')


def _display_sentiment(model_name, label, probability):
    """Render one model's verdict, predicted label and probability."""
    st.write(f'Résultats de {model_name}:')
    if label == 0:
        st.write('La review est négative.')
    else:
        st.write('La review est positive.')
    st.write(f'Score de prédiction ({model_name}) :', f'**{label}**', unsafe_allow_html=True)
    st.write(f'Probabilité ({model_name}) :', f'**{probability:.2%}**', unsafe_allow_html=True)


if st.button('Prédire et générer le nuage de mots'):
    # BUG FIX: guard against an empty text area — WordCloud.generate raises
    # ValueError when given no usable words, and predicting on "" is
    # meaningless anyway.
    if not review_text.strip():
        st.warning('Veuillez saisir une review avant de lancer la prédiction.')
    else:
        # LinearSVC prediction + display
        label_linear_svc, probability_linear_svc = predict_label(review_text)
        _display_sentiment('LinearSVC', label_linear_svc, probability_linear_svc)

        # CamemBERT prediction + display
        label_camembert, probability_camembert = predict_camembert(review_text)
        _display_sentiment('Camembert', label_camembert, probability_camembert)

        # Lemmatize and drop stop words before building the word cloud.
        doc = nlp(review_text)
        lemmatized_text_no_stopwords = " ".join(
            [token.lemma_ for token in doc if not token.is_stop]
        )

        # Stop-word-only input would still crash WordCloud — guard again.
        if lemmatized_text_no_stopwords.strip():
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate(lemmatized_text_no_stopwords)
            st.image(wordcloud.to_image())
        else:
            st.info('Pas assez de mots significatifs pour générer un nuage de mots.')

# File upload button: word cloud for .txt files, table preview for .csv.
uploaded_file = st.file_uploader("Charger un fichier texte", type=["txt", "csv"])
if uploaded_file is not None:
    content_type, _ = mimetypes.guess_type(uploaded_file.name)
    if content_type is None:
        # BUG FIX: guess_type can return None (unknown extension), which
        # previously ignored the file silently — fall back to the MIME type
        # reported by the browser on the UploadedFile itself.
        content_type = uploaded_file.type
    if content_type == 'text/plain':
        file_contents = uploaded_file.read().decode("utf-8")
        st.text(file_contents)

        # Lemmatize and drop stop words before building the word cloud.
        doc = nlp(file_contents)
        lemmatized_text_no_stopwords = " ".join([token.lemma_ for token in doc if not token.is_stop])

        # BUG FIX: WordCloud.generate raises ValueError on text with no
        # usable words (empty file or stop words only) — guard first.
        if lemmatized_text_no_stopwords.strip():
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate(lemmatized_text_no_stopwords)
            st.image(wordcloud.to_image())
        else:
            st.info('Pas assez de mots significatifs pour générer un nuage de mots.')
    elif content_type == 'text/csv':
        df = pd.read_csv(uploaded_file)
        st.write(df)