kargaranamir committed on
Commit
b21d736
1 Parent(s): 44dbf52

add preprocess.

Browse files
Files changed (1) hide show
  1. app.py +37 -2
app.py CHANGED
@@ -6,7 +6,7 @@
6
  # This space is built based on AMR-KELEG/ALDi space.
7
  # GlotLID Space
8
 
9
-
10
  import constants
11
  import pandas as pd
12
  import streamlit as st
@@ -19,6 +19,7 @@ from altair import X, Y, Scale
19
  import base64
20
  import json
21
  import os
 
22
 
23
  @st.cache_resource
24
  def load_sp():
@@ -45,9 +46,42 @@ def get_script(text):
45
  else:
46
  all_scripts = 'Zyyy'
47
 
 
 
 
 
 
48
  return main_script, all_scripts
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  @st.cache_data
52
  def language_names(json_path):
53
  with open(json_path, 'r') as json_file:
@@ -161,6 +195,8 @@ def compute(sentences, version = 'v2'):
161
  probs = []
162
  labels = []
163
 
 
 
164
  for index, sent in enumerate(sentences):
165
 
166
  output = model_choice.predict(sent)
@@ -227,7 +263,6 @@ with tab1:
227
  clicked = st.button("Submit")
228
 
229
  if sent:
230
- sent = sent.replace('\n', ' ')
231
 
232
  probs, labels = compute([sent], version=version)
233
  prob = probs[0]
 
6
  # This space is built based on AMR-KELEG/ALDi space.
7
  # GlotLID Space
8
 
9
+ import string
10
  import constants
11
  import pandas as pd
12
  import streamlit as st
 
19
  import base64
20
  import json
21
  import os
22
+ import re
23
 
24
  @st.cache_resource
25
  def load_sp():
 
46
  else:
47
  all_scripts = 'Zyyy'
48
 
49
+ for ws in all_scripts:
50
+ if ws in ['Kana', 'Hrkt', 'Hani', 'Hira']:
51
+ all_scripts.append('Jpan')
52
+
53
+ all_scripts = list(set(all_scripts))
54
  return main_script, all_scripts
55
 
56
 
57
# Characters that appear in text of almost any language and therefore carry
# no signal for language identification: a few punctuation marks plus the
# ASCII digits. Built once at import time instead of on every call.
_UBIQUITOUS_CHARS = ':•#{|}' + string.digits
_UBIQUITOUS_TABLE = str.maketrans(dict.fromkeys(_UBIQUITOUS_CHARS, ' '))
_WHITESPACE_RUN = re.compile(r'\s+')


def preprocess_text(text):
    """Apply preprocessing to the given text.

    Steps, in order:
      1. Replace each ubiquitous character (``: • # { | }`` and the ASCII
         digits ``0-9``) with a single space.
      2. Collapse every run of whitespace into one space.
      3. Strip leading and trailing whitespace.

    Args:
        text: The text to be preprocessed.

    Returns:
        The preprocessed text.
    """
    # One pass over the string replaces all ubiquitous characters.
    text = text.translate(_UBIQUITOUS_TABLE)

    # Newlines count as whitespace, so this also turns '\n' into a space;
    # a separate replace('\n', ' ') step is not needed.
    text = _WHITESPACE_RUN.sub(' ', text)

    return text.strip()
83
+
84
+
85
  @st.cache_data
86
  def language_names(json_path):
87
  with open(json_path, 'r') as json_file:
 
195
  probs = []
196
  labels = []
197
 
198
+ sentences = [preprocess_text(sent) for sent in sentences]
199
+
200
  for index, sent in enumerate(sentences):
201
 
202
  output = model_choice.predict(sent)
 
263
  clicked = st.button("Submit")
264
 
265
  if sent:
 
266
 
267
  probs, labels = compute([sent], version=version)
268
  prob = probs[0]