sy-lac committed on
Commit
f0d8b59
1 Parent(s): f5f2178

text_to_summary

Browse files
Files changed (2) hide show
  1. app.py +169 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from collections import Counter
3
+ import tensorflow as tf
4
+ import keras
5
+ from tensorflow.keras.preprocessing import text,sequence
6
+ from tensorflow.keras.preprocessing.text import Tokenizer
7
+
8
+ import nltk
9
+ nltk.download('punkt')
10
+ from nltk.tokenize import word_tokenize
11
+ nltk.download('stopwords')
12
+ from nltk.corpus import stopwords
13
+ nltk.download('wordnet')
14
+ from nltk.stem import WordNetLemmatizer
15
+
16
+ from textblob import TextBlob, Blobber
17
+ from textblob_fr import PatternTagger, PatternAnalyzer
18
+
19
+ import spacy.cli
20
+ spacy.cli.download("fr_core_news_md")
21
+
22
+ import torch
23
+ import sentencepiece as spm
24
+ from transformers import CamembertTokenizer, CamembertModel
25
+ from nltk.tokenize import sent_tokenize
26
+ from sklearn.metrics.pairwise import cosine_similarity
27
+
28
+
29
+ # nombre de mots et de mots uniques
30
def number_words(text):
    """Count the whitespace-separated words in *text*.

    Words are obtained with ``str.split()``, so punctuation stays attached
    to its word and the distinct-word count is case-sensitive.

    Returns:
        A 2-tuple of French display strings: the total word count, then
        the number of distinct words.
    """
    tokens = text.split()
    total = len(tokens)
    distinct = len(Counter(tokens))
    return (
        f'Nombre de mots : {total}',
        f'Nombre de mots uniques : {distinct}',
    )
33
+
34
+
35
+ # polarité
36
def polarity(text):
    """Describe the sentiment polarity of a French text.

    The score is the first element of the sentiment pair produced by a
    TextBlob ``Blobber`` configured with the French ``PatternTagger`` and
    ``PatternAnalyzer``.

    Args:
        text: The text to analyze.

    Returns:
        A French sentence stating the score and whether the text leans
        negative (score < 0), positive (score > 0) or neutral (score == 0).
    """
    tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    # Analyze once and reuse the score: previously the text was re-analyzed
    # for every comparison and again when building the message.
    score = tb(text).sentiment[0]

    if score < 0:
        verdict = 'ce texte est plus négatif que positif'
    elif score > 0:
        verdict = 'ce texte est plus positif que négatif'
    else:
        verdict = 'ce texte est neutre, pas plus négatif que positif'
    return f'La polarité de ce texte est {score} : {verdict}'
44
+
45
+
46
+ # subjectivité
47
def subjectivity(text):
    """Describe how subjective a French text is.

    The score is the second element of the sentiment pair produced by a
    TextBlob ``Blobber`` configured with the French ``PatternTagger`` and
    ``PatternAnalyzer``. The branches below treat 0.5 as the midpoint
    between factual and subjective.

    Args:
        text: The text to analyze.

    Returns:
        A French sentence stating the score and whether the text is mostly
        factual (score < 0.5), mostly subjective (score > 0.5) or balanced
        (score == 0.5).
    """
    tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    # Analyze once and reuse the score instead of re-running the analyzer
    # for every comparison and for the message.
    score = tb(text).sentiment[1]

    if score < 0.5:
        # Bug fix: this branch used to return the same "plus subjectif que
        # factuel" message as the > 0.5 branch, so a low score was reported
        # as subjective.
        verdict = 'ce texte est plus factuel que subjectif'
    elif score > 0.5:
        verdict = 'ce texte est plus subjectif que factuel'
    else:
        verdict = 'ce texte est neutre, pas plus subjectif que factuel'
    return f'La subjectivité de ce texte est {score} : {verdict}'
55
+
56
+
57
+ # mots clés
58
def keywords(text):
    """Extract the most frequent nouns, proper nouns and verbs of *text*.

    The text is tagged with the spaCy ``fr_core_news_md`` pipeline (loaded
    on every call); tokens tagged NOUN, PROPN or VERB are counted by their
    surface form and the ten most common are kept.

    Returns:
        A 2-tuple ``(label, words)`` where ``label`` is the fixed string
        ``'mots clés :'`` and ``words`` is the comma-separated keyword list.
    """
    nlp = spacy.load("fr_core_news_md")
    wanted_pos = ('NOUN', 'PROPN', 'VERB')

    candidates = [tok.text for tok in nlp(text) if tok.pos_ in wanted_pos]
    # most_common() yields (word, count) pairs; only the word is displayed.
    top_words = [word for word, _count in Counter(candidates).most_common(10)]

    return 'mots clés :', ', '.join(top_words)
69
+
70
+
71
+ # summary1
72
def summary_1(text):
    """Build a short extractive summary from CamemBERT sentence embeddings.

    The text is split into sentences, each sentence is embedded as the mean
    of its CamemBERT token vectors, and a cosine-similarity matrix is
    computed between sentences. For each of the first ``num_sentences``
    sentences, the sentence most similar to it (other than itself) is added
    to the summary.

    Args:
        text: The text to summarize.

    Returns:
        The selected sentences joined by single spaces. An empty string is
        returned when no sentence is found, and a one-sentence text is
        returned unchanged.
    """
    sentences = sent_tokenize(text)
    # Guard degenerate inputs before loading the model: with fewer than two
    # sentences there is no "most similar other sentence" and the original
    # indexing raised IndexError.
    if not sentences:
        return ''
    if len(sentences) == 1:
        return sentences[0]

    model = CamembertModel.from_pretrained('camembert-base')
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

    ## preprocessing, padding, encoding
    # Let the tokenizer pad with its own pad token and build the attention
    # mask. The previous manual padding used a hard-coded id 0 (not
    # guaranteed to be the pad id) and passed no mask, so padding positions
    # were attended to. truncation=True keeps over-long sentences within the
    # model's maximum input length.
    encoded = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    ## embedding
    with torch.no_grad():
        last_hidden_states = model(**encoded)[0]

    # Mean-pool over real tokens only, so the amount of padding a sentence
    # received does not change its embedding.
    mask = encoded['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
    summed = (last_hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)
    sentence_embeddings = (summed / token_counts).numpy()

    ## summarizing
    similarity_matrix = cosine_similarity(sentence_embeddings)

    num_sentences = 2
    chosen = []
    for i in range(min(num_sentences, len(sentences))):
        scores = similarity_matrix[i].copy()
        # Exclude the sentence itself explicitly rather than assuming it is
        # always ranked first.
        scores[i] = float('-inf')
        best = int(scores.argmax())
        # Avoid repeating a sentence when two anchors share a neighbour.
        if best not in chosen:
            chosen.append(best)

    return ' '.join(sentences[j] for j in chosen)
114
+
115
+
116
+ # summary2
117
def summary_2(text):
    """Summarize *text* by keeping sentences that mention its top nouns.

    The text is tagged with the spaCy ``fr_core_news_md`` pipeline (loaded
    on every call). The three most frequent NOUN/PROPN surface forms are
    collected, and every sentence containing at least one of them as an
    NLTK word token is kept.

    Args:
        text: The text to summarize.

    Returns:
        The list of retained sentences, in their original order. The list
        is empty when no sentence contains a top word.
    """
    nlp = spacy.load("fr_core_news_md")
    nouns = [
        token.text
        for token in nlp(text)
        if token.pos_ in ('NOUN', 'PROPN')
    ]

    # most_common() yields (word, count) pairs. Bug fix: the sentence words
    # used to be intersected with those pairs instead of the bare words, so
    # nothing ever matched and the summary was always empty.
    top_words = {word for word, _count in Counter(nouns).most_common(3)}

    return [
        sent
        for sent in sent_tokenize(text)
        if top_words.intersection(word_tokenize(sent))
    ]
141
+
142
+
143
def analyze_text(text):
    """Run every analysis on *text* and bundle the results.

    Returns:
        A 6-tuple, in this order: the result of ``number_words``,
        ``polarity``, ``subjectivity``, ``keywords``, ``summary_1`` and
        ``summary_2`` applied to *text*.
    """
    return (
        number_words(text),
        polarity(text),
        subjectivity(text),
        keywords(text),
        summary_1(text),
        summary_2(text),
    )
152
+
153
+
154
# Streamlit page: a text area plus an "Analyze" button that runs
# analyze_text() on the input and prints each result.
st.title("Text Analysis and Summary")
text = st.text_area("Enter text here:")

if st.button("Analyze"):
    if text:
        nb_mots, polarite, subjectivite, mots_cles, resume1, resume2 = analyze_text(text)

        # number_words() returns two already-labelled strings; write them
        # one per line instead of interpolating the tuple's repr.
        for line in nb_mots:
            st.write(line)
        st.write(f'Polarité : {polarite}')
        st.write(f'Subjectivité : {subjectivite}')
        # keywords() returns (label, comma-joined words). Joining the whole
        # tuple produced "mots clés :, ..."; only the second item is the
        # keyword list.
        st.write(f'Mots clés : {mots_cles[1]}')
        st.write(f'Résumé 1 : {resume1}')
        st.write('Résumé 2 :')
        for sent in resume2:
            st.write(sent)
    else:
        # Give feedback rather than silently ignoring an empty submission.
        st.warning("Please enter some text to analyze.")
169
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ nltk
2
+ textblob
3
+ textblob-fr
4
+ sentencepiece
5
+ transformers
6
+ spacy
7
+ torch
8
+ scikit-learn
9
+ streamlit