File size: 4,424 Bytes
3daec5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f592e0b
3daec5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import spacy
from wordcloud import WordCloud
from io import StringIO, BytesIO
import mimetypes
from transformers import CamembertForSequenceClassification, CamembertTokenizer
import torch

# Model Loading
# Classic pipeline: a scikit-learn classifier (LinearSVC, per the UI text
# below) and its fitted text vectorizer, both deserialized from disk.
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# CamemBERT binary classifier: start from the pretrained base checkpoint,
# then overlay locally fine-tuned weights on CPU.
# NOTE(review): strict=False silently skips missing/unexpected keys —
# confirm camembertperso.pth actually matches this architecture.
camembert_model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=2)
state_dict = torch.load('camembertperso.pth', map_location='cpu')
camembert_model.load_state_dict(state_dict, strict=False)
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)

# French spaCy pipeline used for lemmatization and stop-word filtering.
nlp = spacy.load("fr_core_news_sm")

# Text Processing Functions
def clean_text(text):
    """Normalize raw input: trim surrounding whitespace, then lowercase."""
    stripped = text.strip()
    return stripped.lower()

def lemmatize_text(text):
    """Return *text* with each token replaced by its spaCy lemma, space-joined."""
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Prediction Functions
def predict_label(text):
    """Classify *text* with the LinearSVC pipeline.

    Returns a ``(label, probability)`` pair, where the probability is the
    SVM decision-function margin squashed into (0, 1) with a logistic.
    """
    prepared = lemmatize_text(clean_text(text))
    features = vectorizer.transform([prepared])

    label = model.predict(features)[0]
    margin = model.decision_function(features)[0]
    # Map the unbounded margin onto a pseudo-probability.
    probability = 1.0 / (1.0 + np.exp(-margin))

    return label, probability

def predict_camembert(text):
    """Classify *text* with the fine-tuned CamemBERT model.

    Returns ``(label, probability)`` where label is the argmax class
    (0 = negative, 1 = positive per the UI below) and probability is the
    softmax score of the positive class.
    """
    tokens = tokenizer.encode_plus(text, return_tensors="pt")

    with torch.no_grad():
        outputs = camembert_model(**tokens)

        # BUG FIX: the previous code picked outputs[1] whenever the model
        # returned more than one tensor, but when no labels are passed the
        # extra outputs are hidden states / attentions, not logits. The
        # logits live on outputs.logits (ModelOutput) or at index 0 on
        # legacy tuple-style outputs.
        logits = getattr(outputs, "logits", None)
        if logits is None:
            logits = outputs[0]

        predictions = torch.argmax(logits, dim=1).item()
        probabilities = torch.softmax(logits, dim=1)[:, 1].item()

    return predictions, probabilities

# App Interface
st.title('Analyse de sentiments')

st.write('Cet outil permet de prédire si une review est positive ou négative.')

review_text = st.text_area('Saisir la review ou charger un fichier :')


def _display_sentiment(model_name, label, probability):
    """Render one model's verdict, predicted label and probability."""
    st.write(f'Résultats de {model_name}:')
    if label == 0:
        st.write('La review est négative.')
    else:
        st.write('La review est positive.')
    st.write(f'Score de prédiction ({model_name}) :', f'**{label}**', unsafe_allow_html=True)
    st.write(f'Probabilité ({model_name}) :', f'**{probability:.2%}**', unsafe_allow_html=True)


if st.button('Prédire et générer le nuage de mots'):
    # BUG FIX: guard against an empty text area — WordCloud.generate raises
    # ValueError when given no usable words, and predicting on "" is
    # meaningless anyway.
    if not review_text.strip():
        st.warning('Veuillez saisir une review avant de lancer la prédiction.')
    else:
        # LinearSVC prediction + display
        label_linear_svc, probability_linear_svc = predict_label(review_text)
        _display_sentiment('LinearSVC', label_linear_svc, probability_linear_svc)

        # CamemBERT prediction + display
        label_camembert, probability_camembert = predict_camembert(review_text)
        _display_sentiment('Camembert', label_camembert, probability_camembert)

        # Lemmatize and drop stop words before building the word cloud.
        doc = nlp(review_text)
        lemmatized_text_no_stopwords = " ".join(
            [token.lemma_ for token in doc if not token.is_stop]
        )

        # Stop-word-only input would still crash WordCloud — guard again.
        if lemmatized_text_no_stopwords.strip():
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate(lemmatized_text_no_stopwords)
            st.image(wordcloud.to_image())
        else:
            st.info('Pas assez de mots significatifs pour générer un nuage de mots.')

# File upload button: word cloud for .txt files, table preview for .csv.
uploaded_file = st.file_uploader("Charger un fichier texte", type=["txt", "csv"])
if uploaded_file is not None:
    content_type, _ = mimetypes.guess_type(uploaded_file.name)
    if content_type is None:
        # BUG FIX: guess_type can return None (unknown extension), which
        # previously ignored the file silently — fall back to the MIME type
        # reported by the browser on the UploadedFile itself.
        content_type = uploaded_file.type
    if content_type == 'text/plain':
        file_contents = uploaded_file.read().decode("utf-8")
        st.text(file_contents)

        # Lemmatize and drop stop words before building the word cloud.
        doc = nlp(file_contents)
        lemmatized_text_no_stopwords = " ".join([token.lemma_ for token in doc if not token.is_stop])

        # BUG FIX: WordCloud.generate raises ValueError on text with no
        # usable words (empty file or stop words only) — guard first.
        if lemmatized_text_no_stopwords.strip():
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate(lemmatized_text_no_stopwords)
            st.image(wordcloud.to_image())
        else:
            st.info('Pas assez de mots significatifs pour générer un nuage de mots.')
    elif content_type == 'text/csv':
        df = pd.read_csv(uploaded_file)
        st.write(df)