cnmoro committed on
Commit 96d2af2
1 Parent(s): 4dd98b8

Update app.py

Files changed (1)
  1. app.py +73 -118
app.py CHANGED
@@ -1,147 +1,102 @@
- import gradio as gr
  from minivectordb.embedding_model import EmbeddingModel
- from minivectordb.vector_database import VectorDatabase
- from multiprocessing import cpu_count
- from functools import lru_cache
- import fasttext, random, tiktoken, os, pickle
- import concurrent.futures

- os.environ['TOKENIZERS_PARALLELISM'] = 'true'

  langdetect_model = fasttext.load_model('lid.176.ftz')
- embedding_model = EmbeddingModel(onnx_model_cpu_core_count=1)
- en_stop_words = pickle.load(open("en_stopwords.pkl", "rb"))
- pt_stop_words = pickle.load(open("pt_stopwords.pkl", "rb"))
  tokenizer = tiktoken.encoding_for_model("gpt-4")

  def count_tokens_tiktoken(text):
      return len(tokenizer.encode(text))

- def detect_language_en_pt(text):
      detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
-     result = str(detected_lang).replace('__label__', '')
-     if result == 'pt':
-         return 'pt'
-     return 'en'
-
- def generate_combinations(text, word_reduction_factor, stopwords, semantic_embeddings, num_samples=100, keep_tokens=None):
-     if keep_tokens is None:
-         keep_tokens = {"\n", ".", ",", ";", "!", "?"}
-
-     if word_reduction_factor is None:
-         word_reduction_factor = 0.5
-
-     words = text.split()
-     total_words = len(words)
-     num_remove = int(total_words * word_reduction_factor)
-
-     # Update index identification to exclude keep_tokens
-     stopword_indices = [i for i, word in enumerate(words) if word.lower() in stopwords and word not in keep_tokens]
-     non_stopword_indices = [i for i, word in enumerate(words) if word.lower() not in stopwords and word not in keep_tokens]
-
-     non_stopword_words = [word for i, word in enumerate(words) if i in non_stopword_indices]
-
-     # Get the embeddings for the non-stopword words
-     non_stopword_embeddings = extract_embeddings_batch(non_stopword_words)

-     # Calculate the cosine similarity between the original text embedding and the non-stopword words
-     original_text_embedding = semantic_embeddings

-     # Calculate the cosine similarity between the original text embedding and the non-stopword words
-     semantic_db = VectorDatabase()
-     ids = [i for i in range(len(non_stopword_words))]
-     metadata_dicts = [{"w": word} for word in non_stopword_words]
-     semantic_db.store_embeddings_batch(ids, non_stopword_embeddings, metadata_dicts)

-     _, _, ordered_words_metadata = semantic_db.find_most_similar(original_text_embedding, k=len(non_stopword_words))
-     ordered_words = [meta['w'] for meta in ordered_words_metadata]

-     # Create a mapping from word to index for quick lookup
-     word_to_index = {word: i for i, word in enumerate(words)}

-     # Get the ordered indices based on semantic importance (less important words last)
-     ordered_indices = [word_to_index[word] for word in ordered_words if word in word_to_index]

-     # Determine the high-priority words to always keep
-     high_priority_count = len(ordered_indices) - num_remove
-     high_priority_count = max(high_priority_count, 0) # Ensure it's not negative
-     high_priority_indices = ordered_indices[:high_priority_count]

-     combinations = []
-     for _ in range(num_samples):
-         # Calculate remaining words to remove
-         remaining_remove = num_remove

-         # Ensure we don't try to sample more items than exist
-         if len(stopword_indices) > 0:
-             num_stop = random.randint(0, min(remaining_remove, len(stopword_indices)))
-         else:
-             num_stop = 0
-
-         remaining_remove -= num_stop
-
-         if remaining_remove > 0:
-             lower_priority_indices = ordered_indices[high_priority_count:]
-             num_non_stop = min(remaining_remove, len(lower_priority_indices)) # Ensure we don't sample more than available
-             prioritized_non_stop_indices = random.sample(lower_priority_indices, num_non_stop) if num_non_stop > 0 else []
          else:
-             prioritized_non_stop_indices = []
-
-         stop_comb = random.sample(stopword_indices, num_stop) if num_stop > 0 else []
-         combination = set(stop_comb + prioritized_non_stop_indices)
-
-         new_string = [word for i, word in enumerate(words) if i not in combination or i in high_priority_indices]
-         combinations.append(' '.join(new_string))
-
-     return list(set(combinations))
-
- @lru_cache(maxsize=50000)
- def extract_embeddings(text):
-     return embedding_model.extract_embeddings(text)
-
- def extract_embeddings_batch(texts):
-     return [extract_embeddings(text) for text in texts]

- def compress_semantically(input_text, word_reduction_factor=0.35):

-     num_samples = 500
-     word_count = len(input_text.split())

-     thresholds = [(1500, 80), (1000, 90), (700, 110), (500, 130), (250, 160)]
-     for threshold, value in thresholds:
-         if word_count > threshold:
-             num_samples = value
-             break
-
-     semantic_embeddings = extract_embeddings(input_text)
-     text_lang = detect_language_en_pt(input_text)
-     stopwords = en_stop_words if text_lang == 'en' else pt_stop_words
-     text_combinations = generate_combinations(input_text, word_reduction_factor, stopwords, semantic_embeddings, num_samples=num_samples)
-
-     n = int(num_samples / cpu_count())
-     # Aggregate text_combinations into blocks of "n"
-     text_combinations_chunks = [text_combinations[i:i + n] for i in range(0, len(text_combinations), n)]
-
-     # Calculate the embeddings for each combination
-     combinations_embeddings = []
-     with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_count()) as executor:
-         for embeddings in executor.map(extract_embeddings_batch, text_combinations_chunks):
-             combinations_embeddings.extend(embeddings)
-
-     semantic_db = VectorDatabase()
-     unique_ids = [ i for i in range(len(text_combinations)) ]
-     metadata_dicts = [ {"text": text} for text in text_combinations ]
-     semantic_db.store_embeddings_batch(unique_ids, combinations_embeddings, metadata_dicts)
-
-     _, _, result = semantic_db.find_most_similar(semantic_embeddings, k=1)
-     best_compressed_sentence = result[0]['text']
-     return best_compressed_sentence

  async def predict(text, word_reduction_factor):
      if len(text.split()) > 700:
          return "Text is too long for this demo. Please provide a text with less than 700 words."

-     compressed = compress_semantically(text, word_reduction_factor = word_reduction_factor)
      perc_reduction = round(100 - (count_tokens_tiktoken(compressed) / count_tokens_tiktoken(text)) * 100, 2)

      return f"{compressed}\n\nToken Reduction: {perc_reduction}%"
@@ -162,7 +117,7 @@ reduction_factor = gr.Slider(
      value=0.5,
      step=0.05,
      interactive=True,
-     label="Word Reduction Factor"
  )
  # Create the gradio interface
  gr.Interface(
 
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.decomposition import LatentDirichletAllocation
  from minivectordb.embedding_model import EmbeddingModel
+ from sklearn.metrics.pairwise import cosine_similarity
+ import tiktoken, nltk, numpy as np, fasttext, pickle
+ from nltk.tokenize import sent_tokenize
+ import gradio as gr

+ nltk.download('punkt')
+ nltk.download('stopwords')

  langdetect_model = fasttext.load_model('lid.176.ftz')
+ embedding_model = EmbeddingModel(onnx_model_cpu_core_count=2)
+ english_stopwords = pickle.load(open("en_stopwords.pkl", "rb"))
+ portuguese_stopwords = pickle.load(open("pt_stopwords.pkl", "rb"))
  tokenizer = tiktoken.encoding_for_model("gpt-4")

  def count_tokens_tiktoken(text):
      return len(tokenizer.encode(text))

+ def detect_language(text):
      detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
+     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
+
+ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
+     def calculate_similarity(embed1, embed2):
+         return cosine_similarity([embed1], [embed2])[0][0]
+
+     def create_lda_model(texts, stopwords):
+         vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stopwords)
+         doc_term_matrix = vectorizer.fit_transform(texts)
+         lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
+         lda.fit(doc_term_matrix)
+         return lda, vectorizer
+
+     def get_topic_distribution(text, lda, vectorizer):
+         vec = vectorizer.transform([text])
+         return lda.transform(vec)[0]
+
+     def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
+         sentence_embedding = embedding_model.extract_embeddings(sentence)
+         semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
+
+         topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
+         topic_importance = np.max(topic_dist)
+
+         # Calculate lexical diversity
+         words = sentence.split()
+         unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
+         lexical_diversity = len(unique_words) / len(words) if words else 0
+
+         # Combine factors (you can adjust weights as needed)
+         importance = (0.4 * semantic_similarity) + (0.4 * topic_importance) + (0.2 * lexical_diversity)
+         return importance

+     # Split the text into sentences
+     sentences = sent_tokenize(full_text)

+     text_lang = detect_language(full_text)

+     # Create LDA model
+     lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)

+     # Get document-level embedding
+     doc_embedding = embedding_model.extract_embeddings(full_text)

+     # Calculate importance for each sentence
+     sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
+                        for sentence in sentences]

+     # Sort sentences by importance
+     sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)

+     # Determine how many words to keep
+     total_words = sum(len(sentence.split()) for sentence in sentences)
+     target_words = int(total_words * compression_rate)

+     # Reconstruct the compressed text
+     compressed_text = []
+     current_words = 0
+     for sentence, _ in sorted_sentences:
+         sentence_words = len(sentence.split())
+         if current_words + sentence_words <= target_words:
+             compressed_text.append(sentence)
+             current_words += sentence_words
          else:
+             break

+     # Reorder sentences to maintain original flow
+     compressed_text.sort(key=lambda x: sentences.index(x))

+     return ' '.join(compressed_text)


  async def predict(text, word_reduction_factor):
      if len(text.split()) > 700:
          return "Text is too long for this demo. Please provide a text with less than 700 words."

+     compressed = semantic_compress_text(text, compression_rate = 1 - word_reduction_factor)
      perc_reduction = round(100 - (count_tokens_tiktoken(compressed) / count_tokens_tiktoken(text)) * 100, 2)

      return f"{compressed}\n\nToken Reduction: {perc_reduction}%"
 
      value=0.5,
      step=0.05,
      interactive=True,
+     label="Reduction Factor"
  )
  # Create the gradio interface
  gr.Interface(
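
For readers who want to try the scoring this commit introduces outside the Space, below is a minimal, self-contained sketch: each sentence is scored as 0.4 × cosine similarity to the whole-document embedding, plus 0.4 × its strongest LDA topic weight, plus 0.2 × lexical diversity, and the highest-scoring sentences are kept up to a word budget, then restored to their original order. TF-IDF vectors, scikit-learn's built-in English stop words, and a regex splitter stand in here for the Space's minivectordb `EmbeddingModel`, the pickled stop-word lists, and NLTK's `sent_tokenize`; the `sketch_semantic_compress` name is illustrative only.

```python
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

def sketch_semantic_compress(full_text, compression_rate=0.7, num_topics=2):
    # Naive sentence splitter standing in for nltk.sent_tokenize.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', full_text) if s.strip()]

    # TF-IDF vectors stand in for the ONNX embedding model used in app.py.
    tfidf = TfidfVectorizer().fit(sentences + [full_text])
    doc_vec = tfidf.transform([full_text])
    sent_vecs = tfidf.transform(sentences)

    # LDA topic model over the sentences, mirroring create_lda_model.
    cv = CountVectorizer(stop_words='english')
    doc_term_matrix = cv.fit_transform(sentences)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(doc_term_matrix)
    topic_dists = lda.transform(doc_term_matrix)  # one topic distribution per sentence

    scored = []
    for i, sentence in enumerate(sentences):
        semantic_similarity = cosine_similarity(doc_vec, sent_vecs[i])[0][0]
        topic_importance = float(np.max(topic_dists[i]))
        words = sentence.split()
        lexical_diversity = len({w.lower() for w in words}) / len(words) if words else 0
        # Same 0.4 / 0.4 / 0.2 weighting as sentence_importance in the commit.
        scored.append((sentence, 0.4 * semantic_similarity
                                 + 0.4 * topic_importance
                                 + 0.2 * lexical_diversity))

    # Keep the best sentences until the word budget is spent, then restore original order.
    target_words = int(sum(len(s.split()) for s in sentences) * compression_rate)
    kept, used = [], 0
    for sentence, _ in sorted(scored, key=lambda x: x[1], reverse=True):
        n_words = len(sentence.split())
        if used + n_words > target_words:
            break
        kept.append(sentence)
        used += n_words
    kept.sort(key=sentences.index)
    return ' '.join(kept)

if __name__ == '__main__':
    sample = ("Prompt compression trims redundant text before it reaches a language model. "
              "Shorter prompts cost fewer tokens and return answers faster. "
              "This demo ranks whole sentences instead of dropping individual words. "
              "The weather was pleasant on the day the demo was written.")
    print(sketch_semantic_compress(sample, compression_rate=0.6))
```

With the real embedding model and stop-word lists plugged back in, this follows the same flow as `semantic_compress_text` above; the fixed weights and the greedy word budget are what decide which sentences survive.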