Shchushch committed on
Commit
a7055be
·
1 Parent(s): 50872cb
Files changed (1) hide show
  1. find.py +3 -4
find.py CHANGED
@@ -15,9 +15,8 @@ from tqdm import tqdm
15
  tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
16
  model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
17
 
18
- nltk.download('stopwords')
19
-
20
- eng_stop_words = stopwords.words('english')
21
  with open('russian.txt', 'r') as f:
22
  ru_stop_words = f.read()
23
 
@@ -99,7 +98,7 @@ def clean(text: str)-> str:
99
  text = ''.join(c for c in text if c in allow)
100
  text= text.split()
101
  text = [word for word in text if word.lower() not in ru_stop_words]
102
- text = [word for word in text if word.lower() not in eng_stop_words]
103
  return ' '.join(text)
104
 
105
 
 
15
  tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
16
  model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
17
 
18
+ # nltk.download('stopwords')
19
+ #eng_stop_words = stopwords.words('english')
 
20
  with open('russian.txt', 'r') as f:
21
  ru_stop_words = f.read()
22
 
 
98
  text = ''.join(c for c in text if c in allow)
99
  text= text.split()
100
  text = [word for word in text if word.lower() not in ru_stop_words]
101
+ #text = [word for word in text if word.lower() not in eng_stop_words]
102
  return ' '.join(text)
103
 
104