cnmoro committed on
Commit
bb73643
1 Parent(s): b218a36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -5
app.py CHANGED
@@ -3,17 +3,15 @@ from minivectordb.embedding_model import EmbeddingModel
3
  from minivectordb.vector_database import VectorDatabase
4
  from multiprocessing import cpu_count
5
  from functools import lru_cache
6
- import fasttext, random, nltk, tiktoken, os
7
  import concurrent.futures
8
- nltk.download('stopwords')
9
- from nltk.corpus import stopwords
10
 
11
  os.environ['TOKENIZERS_PARALLELISM'] = 'true'
12
 
13
  langdetect_model = fasttext.load_model('lid.176.ftz')
14
  embedding_model = EmbeddingModel(onnx_model_cpu_core_count=1)
15
- en_stop_words = stopwords.words('english')
16
- pt_stop_words = stopwords.words('portuguese')
17
  tokenizer = tiktoken.encoding_for_model("gpt-4")
18
 
19
  def count_tokens_tiktoken(text):
 
3
  from minivectordb.vector_database import VectorDatabase
4
  from multiprocessing import cpu_count
5
  from functools import lru_cache
6
+ import fasttext, random, tiktoken, os, pickle
7
  import concurrent.futures
 
 
8
 
9
  os.environ['TOKENIZERS_PARALLELISM'] = 'true'
10
 
11
  langdetect_model = fasttext.load_model('lid.176.ftz')
12
  embedding_model = EmbeddingModel(onnx_model_cpu_core_count=1)
13
+ en_stop_words = pickle.load(open("en_stopwords.pkl", "rb"))
14
+ pt_stop_words = pickle.load(open("pt_stopwords.pkl", "rb"))
15
  tokenizer = tiktoken.encoding_for_model("gpt-4")
16
 
17
  def count_tokens_tiktoken(text):