Spaces:

Tihsrah-CD
/

Hinglish-Text-Normalizer

Sleeping

App Files Community

Tihsrah-CD committed on Aug 14, 2023

Commit

6165314

•

1 Parent(s): 5a43d63

adding classifier

Browse files

Files changed (2) hide show

app.py +277 -29
classifer.joblib +3 -0

app.py CHANGED Viewed

@@ -1,35 +1,283 @@
 import streamlit as st
-import requests
-import os
-def find_file(filename, directory):
-    for root, dirs, files in os.walk(directory):
-        if filename in files:
-            return os.path.join(root, filename)
-    return None
 def main():
-    st.title('Download File from OneDrive')
-    search_filename = "english_vocab.pkl"
-    download_link = "https://upesstd-my.sharepoint.com/:u:/g/personal/500082340_stu_upes_ac_in/EYwRTq9dcTJHppgydRR-8BMBYY2BehA6jxri5rKehcSZig?e=fjAYDf"
-    save_filename = "classifer.joblib"
-    found_path = find_file(search_filename, os.getcwd())
-    if found_path:
-        st.success(f"Found {search_filename} at {found_path}")
-        if st.button('Download File'):
-            response = requests.get(download_link, allow_redirects=True)
-            if response.status_code == 200:
-                save_path = os.path.join(os.path.dirname(found_path), save_filename)
-                with open(save_path, 'wb') as file:
-                    file.write(response.content)
-                st.success(f"File downloaded successfully and saved as {save_path}")
-            else:
-                st.error(f"Failed to download the file. Status code: {response.status_code}")
-    else:
-        st.error(f"File {search_filename} not found in the current directory or subdirectories.")
 if __name__ == '__main__':
     main()

 import streamlit as st
+import pandas as pd
+import pickle
+from tqdm import tqdm
+from Levenshtein import distance as lev
+import joblib
+from googletrans import Translator
+from indictrans import Transliterator
+from pyphonetics import RefinedSoundex
+import enchant
+from bs4 import BeautifulSoup
+import re
 def main():
+    st.title('Text Processing App')
+    dictn = enchant.Dict("en_US")
+    rs = RefinedSoundex()
+    normalized_string_final=[]
+    translator = Translator()
+    trn = Transliterator(source='eng', target='hin')
+    with open(r'./english_vocab.pkl', "rb") as fp:
+       english = pickle.load(fp)
+    english_vocab=english
+    with open(r'./hinglish_vocab.pkl', "rb") as fp:
+       hinglish = pickle.load(fp)
+    hinglish_vocab=hinglish
+    english_vocab['and'] = ['and']
+    english_vocab['is'] = ['is']
+    def clean_tweet(tweet):
+        text=re.sub(r'@ [A-Za-z0-9\']+','',tweet)
+        text=BeautifulSoup(text,'lxml').get_text()
+        text=re.sub(r'https (//)[A-Za-z0-9. ]*(/) [A-Za-z0-9]+','',text)
+        text=re.sub(r'https[A-Za-z0-9/. ]*','',text)
+        text=re.sub("[^a-zA-Z]"," ",text)
+        text=re.sub(r'\bRT\b',' ',text)
+        text=re.sub(r'\bnan\b',' ',text)
+        return text
+    input_text = st.text_area("Enter the text:")
+    total_translated = []
+    if st.button('Process'):
+        # Create a DataFrame with the user input text
+        data = {'Text': [input_text]}
+        df1 = pd.DataFrame(data)
+        # Apply the clean_tweet function to the user input text
+        df1['Text'] = df1['Text'].apply(clean_tweet)
+        # Extract the cleaned text
+        cleaned_text = df1['Text'].tolist()[0]
+        # Process the cleaned text further if needed
+        total_text = [cleaned_text]
+        st.write("Input Text:", total_text)
+        for i in tqdm(total_text):
+            test_text=i.split()
+            # english word change from vocab
+            not_changed_idx=[]
+            for i in range(len(test_text)):
+                not_changed_idx.append(0)
+            changed_text=[]
+            changed_idx=[]
+        #     print("1st",changed_text)
+            for i in range(len(test_text)):
+                for key in english_vocab:
+                    done=0
+                    for val in  english_vocab[key]:
+                        if(test_text[i]==val):
+                            # print("KEY = ",key,"VAL =",val,"i =",test_text[i],"ADJENCENCY_DATA =",adjacency_data[key])
+        #                     print("yahan par",key,val,test_text[i])
+                            changed_text.append(key)
+                            changed_idx.append(i)
+                            not_changed_idx[i]=1
+                            done=1
+                            # print("breaking")
+                            break
+                    if done==1:
+                        # print("breaking again")
+                        break
+            normalized_string=[]
+            # making changed text and idx to a dictionary with two lists
+            res = dict(zip(changed_idx, changed_text))
+        #     print(res)
+            for i in range(len(test_text)):
+                try:
+                    normalized_string.append(res[i])
+                except:
+                    normalized_string.append(test_text[i])
+            print("English Normalized String : ",normalized_string)
+            # hinglish word change
+            test_list = [i for i in range(len(test_text))]
+            changed_hing_idx = [i for i in test_list if i not in changed_idx]
+            # print(changed_hing_idx)
+            hinglish_text_part=[]
+            for i in changed_hing_idx:
+                try:
+                    hinglish_text_part.append(test_text[i])
+                except:
+                    pass
+        #     print(hinglish_text_part)
+            changed_text2=[]
+            changed_idx2=[]
+        #     print("1st hing",changed_text2)
+            for i in range(len(hinglish_text_part)):
+                for key in hinglish_vocab:
+                    done=0
+                    for val in  hinglish_vocab[key]:
+                        if(hinglish_text_part[i]==val):
+                            # print("KEY = ",key,"VAL =",val,"i =",test_text[i],"ADJENCENCY_DATA =",adjacency_data[key])
+        #                     print(key,val,hinglish_text_part[i])
+                            changed_text2.append(key)
+                            changed_idx2.append(i)
+                            not_changed_idx[i]=1
+                            done=1
+                            # print("breaking")
+                            break
+                    if done==1:
+                        # print("breaking again")
+                        break
+            # making changed text and idx to a dictionary with two lists
+            normalized_string2=[]
+        #     print("changed_text 2 ",changed_text2)
+            res2 = dict(zip(changed_idx2, changed_text2))
+        #     print(res2)
+            for i in range(len(hinglish_text_part)):
+                try:
+                    normalized_string2.append(res2[i])
+                except:
+                    normalized_string2.append(hinglish_text_part[i])
+        #     print("normalised string 2 :",normalized_string2)
+            changed_idx=list(set(changed_idx))
+            changed_idx.sort()
+        #     print("changed idx",changed_idx)
+            for i in changed_idx:
+                normalized_string2.append(res[i])
+            print("Hinglish Normalized String : ",normalized_string)
+        #     print(not_changed_idx)
+            # finding phoneme and leventise distance for unchanged word
+            for i in range(len(not_changed_idx)):
+                try:
+                    if not_changed_idx[i]==0:
+                        eng_phoneme_correction=[]
+                        for j in english_vocab:
+                            # print(normalized_string2[i],j)
+                            try:
+                                phoneme=rs.distance(normalized_string2[i],j)
+                            except:
+                                pass
+                            if phoneme<=1:
+                                eng_phoneme_correction.append(j)
+                        eng_lev_correction=[]
+                        for k in eng_phoneme_correction:
+                            dist=lev(normalized_string2[i],k)
+                            if dist <=2:
+                                eng_lev_correction.append(k)
+        #                 print(eng_phoneme_correction)
+        #                 print(eng_lev_correction)
+                        hing_phoneme_correction=[]
+                        for j in hinglish_vocab:
+                            try:
+                                phoneme=rs.distance(normalized_string2[i],j)
+                            except:
+                                pass
+                            if phoneme<=1:
+                                hing_phoneme_correction.append(j)
+                        hing_lev_correction=[]
+                        for k in hing_phoneme_correction:
+                            dist=lev(normalized_string2[i],k)
+                            if dist <=2:
+                                hing_lev_correction.append(k)
+        #                 print(hing_phoneme_correction)
+        #                 print(hing_lev_correction)
+                        eng_lev_correction.extend(hing_lev_correction)
+                        new_correction=eng_lev_correction
+                        eng_lev_correction=[]
+                        # hing_lev_correction=[]
+        #                 print(eng_lev_correction)
+                        for l in new_correction:
+                            dist=lev(normalized_string2[i],l)
+                            eng_lev_correction.append(dist)
+                        min_val=min(eng_lev_correction)
+                        min_idx=eng_lev_correction.index(min_val)
+                        suggestion=dictn.suggest(new_correction[min_idx])
+                        suggestion_lit=[]
+                        for t in suggestion:
+                            dist=lev(new_correction[min_idx],t)
+                            suggestion_lit.append(dist)
+                        min_suggestion_val=min(suggestion_lit)
+                        min_suggestion_idx=suggestion_lit.index(min_suggestion_val)
+        #                 print("Suggestions : ",min_suggestion_val)
+        #                 print(suggestion[min_suggestion_idx])
+                        normalized_string2[i]=suggestion[min_suggestion_idx]
+                except:
+                    pass
+            normalized_string=normalized_string2
+            normalized_string_final=normalized_string2
+            print("Phoneme levenshtein Distionary suggestion Normalized String : ",normalized_string_final)
+            # sentence tagging
+            classifier=joblib.load(r"./classifer.joblib")
+            classify=[]
+            for i in normalized_string:
+                test_classify=classifier(i)
+                classify.append(test_classify[0].get("label"))
+        #     print(normalized_string)
+        #     print(classify)
+            for i in range(len(classify)):
+                if classify[i]=='en':
+                    try:
+                        normalized_string[i]=translator.translate(normalized_string[i] ,src='en',dest='hi').text
+                    except:
+                        normalized_string[i]="delete"
+            print("English -> Hindi Translated String : ",normalized_string)
+            conversion_list=[]
+            for i in tqdm(normalized_string):
+                conversion_list.append(trn.transform(i))
+            print("Hinglish -> Hindi Transliterated String : ",conversion_list)
+            conversion_list=normalized_string
+            string=""
+            sentence=[]
+            for i in conversion_list:
+                string=i+' '+string
+            sentence.append(string)
+            translated=[]
+            for i in tqdm(sentence):
+                try:
+                    translated_text = translator.translate(i ,src='hi',dest='en')
+                    translated.append(translated_text.text)
+                except:
+                    translated.append("delete")
+            print("Hindi -> English Translated String : ",translated)
+            total_translated.append(translated[0])
+            total_translated=pd.DataFrame(total_translated)
+        st.write("English Normalized String:", normalized_string)
+        st.write("Hinglish Normalized String:", normalized_string)
+        st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final)
+        st.write("English -> Hindi Translated String:", normalized_string)
+        st.write("Hinglish -> Hindi Transliterated String:", conversion_list)
+        st.write("Hindi -> English Translated String:", translated)
 if __name__ == '__main__':
     main()

classifer.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04e633bdc6d6fab55414874aa40d34731e3c899a45b440689f3db3808dbe76a6
+size 1121416288