Tihsrah-CD committed on
Commit
6165314
1 Parent(s): 5a43d63

adding classifier

Browse files
Files changed (2) hide show
  1. app.py +277 -29
  2. classifer.joblib +3 -0
app.py CHANGED
@@ -1,35 +1,283 @@
1
  import streamlit as st
2
- import requests
3
- import os
4
-
5
- def find_file(filename, directory):
6
- for root, dirs, files in os.walk(directory):
7
- if filename in files:
8
- return os.path.join(root, filename)
9
- return None
 
 
 
10
 
11
  def main():
12
- st.title('Download File from OneDrive')
13
-
14
- search_filename = "english_vocab.pkl"
15
- download_link = "https://upesstd-my.sharepoint.com/:u:/g/personal/500082340_stu_upes_ac_in/EYwRTq9dcTJHppgydRR-8BMBYY2BehA6jxri5rKehcSZig?e=fjAYDf"
16
- save_filename = "classifer.joblib"
17
-
18
- found_path = find_file(search_filename, os.getcwd())
19
-
20
- if found_path:
21
- st.success(f"Found {search_filename} at {found_path}")
22
- if st.button('Download File'):
23
- response = requests.get(download_link, allow_redirects=True)
24
- if response.status_code == 200:
25
- save_path = os.path.join(os.path.dirname(found_path), save_filename)
26
- with open(save_path, 'wb') as file:
27
- file.write(response.content)
28
- st.success(f"File downloaded successfully and saved as {save_path}")
29
- else:
30
- st.error(f"Failed to download the file. Status code: {response.status_code}")
31
- else:
32
- st.error(f"File {search_filename} not found in the current directory or subdirectories.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  if __name__ == '__main__':
35
  main()
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import pickle
4
+ from tqdm import tqdm
5
+ from Levenshtein import distance as lev
6
+ import joblib
7
+ from googletrans import Translator
8
+ from indictrans import Transliterator
9
+ from pyphonetics import RefinedSoundex
10
+ import enchant
11
+ from bs4 import BeautifulSoup
12
+ import re
13
 
14
  def main():
15
+ st.title('Text Processing App')
16
+
17
+ dictn = enchant.Dict("en_US")
18
+ rs = RefinedSoundex()
19
+ normalized_string_final=[]
20
+ translator = Translator()
21
+ trn = Transliterator(source='eng', target='hin')
22
+
23
+ with open(r'./english_vocab.pkl', "rb") as fp:
24
+ english = pickle.load(fp)
25
+ english_vocab=english
26
+ with open(r'./hinglish_vocab.pkl', "rb") as fp:
27
+ hinglish = pickle.load(fp)
28
+ hinglish_vocab=hinglish
29
+
30
+ english_vocab['and'] = ['and']
31
+ english_vocab['is'] = ['is']
32
+
33
+ def clean_tweet(tweet):
34
+ text=re.sub(r'@ [A-Za-z0-9\']+','',tweet)
35
+ text=BeautifulSoup(text,'lxml').get_text()
36
+ text=re.sub(r'https (//)[A-Za-z0-9. ]*(/) [A-Za-z0-9]+','',text)
37
+ text=re.sub(r'https[A-Za-z0-9/. ]*','',text)
38
+ text=re.sub("[^a-zA-Z]"," ",text)
39
+ text=re.sub(r'\bRT\b',' ',text)
40
+ text=re.sub(r'\bnan\b',' ',text)
41
+ return text
42
+
43
+ input_text = st.text_area("Enter the text:")
44
+ total_translated = []
45
+ if st.button('Process'):
46
+ # Create a DataFrame with the user input text
47
+ data = {'Text': [input_text]}
48
+ df1 = pd.DataFrame(data)
49
+
50
+ # Apply the clean_tweet function to the user input text
51
+ df1['Text'] = df1['Text'].apply(clean_tweet)
52
+
53
+ # Extract the cleaned text
54
+ cleaned_text = df1['Text'].tolist()[0]
55
+
56
+ # Process the cleaned text further if needed
57
+ total_text = [cleaned_text]
58
+ st.write("Input Text:", total_text)
59
+
60
+ for i in tqdm(total_text):
61
+ test_text=i.split()
62
+
63
+ # english word change from vocab
64
+ not_changed_idx=[]
65
+ for i in range(len(test_text)):
66
+ not_changed_idx.append(0)
67
+
68
+ changed_text=[]
69
+ changed_idx=[]
70
+ # print("1st",changed_text)
71
+ for i in range(len(test_text)):
72
+
73
+ for key in english_vocab:
74
+ done=0
75
+ for val in english_vocab[key]:
76
+ if(test_text[i]==val):
77
+ # print("KEY = ",key,"VAL =",val,"i =",test_text[i],"ADJENCENCY_DATA =",adjacency_data[key])
78
+ # print("yahan par",key,val,test_text[i])
79
+ changed_text.append(key)
80
+ changed_idx.append(i)
81
+ not_changed_idx[i]=1
82
+ done=1
83
+ # print("breaking")
84
+ break
85
+ if done==1:
86
+ # print("breaking again")
87
+ break
88
+
89
+ normalized_string=[]
90
+
91
+ # making changed text and idx to a dictionary with two lists
92
+ res = dict(zip(changed_idx, changed_text))
93
+ # print(res)
94
+ for i in range(len(test_text)):
95
+ try:
96
+ normalized_string.append(res[i])
97
+ except:
98
+ normalized_string.append(test_text[i])
99
+ print("English Normalized String : ",normalized_string)
100
+
101
+
102
+ # hinglish word change
103
+ test_list = [i for i in range(len(test_text))]
104
+ changed_hing_idx = [i for i in test_list if i not in changed_idx]
105
+ # print(changed_hing_idx)
106
+ hinglish_text_part=[]
107
+ for i in changed_hing_idx:
108
+ try:
109
+ hinglish_text_part.append(test_text[i])
110
+ except:
111
+ pass
112
+ # print(hinglish_text_part)
113
+
114
+ changed_text2=[]
115
+ changed_idx2=[]
116
+ # print("1st hing",changed_text2)
117
+ for i in range(len(hinglish_text_part)):
118
+
119
+ for key in hinglish_vocab:
120
+ done=0
121
+ for val in hinglish_vocab[key]:
122
+ if(hinglish_text_part[i]==val):
123
+ # print("KEY = ",key,"VAL =",val,"i =",test_text[i],"ADJENCENCY_DATA =",adjacency_data[key])
124
+ # print(key,val,hinglish_text_part[i])
125
+ changed_text2.append(key)
126
+ changed_idx2.append(i)
127
+ not_changed_idx[i]=1
128
+ done=1
129
+ # print("breaking")
130
+ break
131
+ if done==1:
132
+ # print("breaking again")
133
+ break
134
+
135
+
136
+ # making changed text and idx to a dictionary with two lists
137
+ normalized_string2=[]
138
+ # print("changed_text 2 ",changed_text2)
139
+ res2 = dict(zip(changed_idx2, changed_text2))
140
+ # print(res2)
141
+ for i in range(len(hinglish_text_part)):
142
+ try:
143
+ normalized_string2.append(res2[i])
144
+ except:
145
+ normalized_string2.append(hinglish_text_part[i])
146
+ # print("normalised string 2 :",normalized_string2)
147
+
148
+
149
+ changed_idx=list(set(changed_idx))
150
+ changed_idx.sort()
151
+ # print("changed idx",changed_idx)
152
+ for i in changed_idx:
153
+ normalized_string2.append(res[i])
154
+
155
+ print("Hinglish Normalized String : ",normalized_string)
156
+ # print(not_changed_idx)
157
+
158
+
159
+ # finding phoneme and leventise distance for unchanged word
160
+
161
+ for i in range(len(not_changed_idx)):
162
+ try:
163
+ if not_changed_idx[i]==0:
164
+ eng_phoneme_correction=[]
165
+ for j in english_vocab:
166
+ # print(normalized_string2[i],j)
167
+ try:
168
+ phoneme=rs.distance(normalized_string2[i],j)
169
+ except:
170
+ pass
171
+ if phoneme<=1:
172
+ eng_phoneme_correction.append(j)
173
+ eng_lev_correction=[]
174
+ for k in eng_phoneme_correction:
175
+ dist=lev(normalized_string2[i],k)
176
+ if dist <=2:
177
+ eng_lev_correction.append(k)
178
+ # print(eng_phoneme_correction)
179
+ # print(eng_lev_correction)
180
+
181
+
182
+ hing_phoneme_correction=[]
183
+ for j in hinglish_vocab:
184
+ try:
185
+ phoneme=rs.distance(normalized_string2[i],j)
186
+ except:
187
+ pass
188
+ if phoneme<=1:
189
+ hing_phoneme_correction.append(j)
190
+ hing_lev_correction=[]
191
+ for k in hing_phoneme_correction:
192
+ dist=lev(normalized_string2[i],k)
193
+ if dist <=2:
194
+ hing_lev_correction.append(k)
195
+ # print(hing_phoneme_correction)
196
+ # print(hing_lev_correction)
197
+
198
+ eng_lev_correction.extend(hing_lev_correction)
199
+ new_correction=eng_lev_correction
200
+ eng_lev_correction=[]
201
+ # hing_lev_correction=[]
202
+ # print(eng_lev_correction)
203
+
204
+ for l in new_correction:
205
+ dist=lev(normalized_string2[i],l)
206
+ eng_lev_correction.append(dist)
207
+ min_val=min(eng_lev_correction)
208
+ min_idx=eng_lev_correction.index(min_val)
209
+
210
+
211
+ suggestion=dictn.suggest(new_correction[min_idx])
212
+ suggestion_lit=[]
213
+ for t in suggestion:
214
+ dist=lev(new_correction[min_idx],t)
215
+ suggestion_lit.append(dist)
216
+ min_suggestion_val=min(suggestion_lit)
217
+ min_suggestion_idx=suggestion_lit.index(min_suggestion_val)
218
+ # print("Suggestions : ",min_suggestion_val)
219
+ # print(suggestion[min_suggestion_idx])
220
+
221
+
222
+
223
+ normalized_string2[i]=suggestion[min_suggestion_idx]
224
+ except:
225
+ pass
226
+ normalized_string=normalized_string2
227
+ normalized_string_final=normalized_string2
228
+ print("Phoneme levenshtein Distionary suggestion Normalized String : ",normalized_string_final)
229
+ # sentence tagging
230
+ classifier=joblib.load(r"./classifer.joblib")
231
+ classify=[]
232
+ for i in normalized_string:
233
+ test_classify=classifier(i)
234
+ classify.append(test_classify[0].get("label"))
235
+
236
+ # print(normalized_string)
237
+ # print(classify)
238
+
239
+ for i in range(len(classify)):
240
+ if classify[i]=='en':
241
+ try:
242
+ normalized_string[i]=translator.translate(normalized_string[i] ,src='en',dest='hi').text
243
+ except:
244
+ normalized_string[i]="delete"
245
+ print("English -> Hindi Translated String : ",normalized_string)
246
+
247
+
248
+ conversion_list=[]
249
+
250
+ for i in tqdm(normalized_string):
251
+ conversion_list.append(trn.transform(i))
252
+
253
+ print("Hinglish -> Hindi Transliterated String : ",conversion_list)
254
+ conversion_list=normalized_string
255
+ string=""
256
+ sentence=[]
257
+ for i in conversion_list:
258
+ string=i+' '+string
259
+ sentence.append(string)
260
+ translated=[]
261
+ for i in tqdm(sentence):
262
+ try:
263
+ translated_text = translator.translate(i ,src='hi',dest='en')
264
+ translated.append(translated_text.text)
265
+ except:
266
+ translated.append("delete")
267
+ print("Hindi -> English Translated String : ",translated)
268
+ total_translated.append(translated[0])
269
+
270
+ total_translated=pd.DataFrame(total_translated)
271
+
272
+
273
+
274
+
275
+ st.write("English Normalized String:", normalized_string)
276
+ st.write("Hinglish Normalized String:", normalized_string)
277
+ st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final)
278
+ st.write("English -> Hindi Translated String:", normalized_string)
279
+ st.write("Hinglish -> Hindi Transliterated String:", conversion_list)
280
+ st.write("Hindi -> English Translated String:", translated)
281
 
282
  if __name__ == '__main__':
283
  main()
classifer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04e633bdc6d6fab55414874aa40d34731e3c899a45b440689f3db3808dbe76a6
3
+ size 1121416288