sflindrs committed on
Commit
8fbe1d5
·
verified ·
1 Parent(s): 4bf5dae
Files changed (2) hide show
  1. app.py +94 -1
  2. requirements.txt +3 -1
app.py CHANGED
@@ -4,6 +4,9 @@ from PIL import Image
4
  import torch
5
  import spaces
6
  import json
 
 
 
7
 
8
  # Load the processor and model
9
  processor = AutoProcessor.from_pretrained(
@@ -69,6 +72,95 @@ def wrap_json_in_markdown(text):
69
  result.append(text) # Append any remaining text
70
  return ''.join(result)
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  @spaces.GPU()
73
  def process_image_and_text(image, text):
74
  # Process the image and text
@@ -91,8 +183,9 @@ def process_image_and_text(image, text):
91
  generated_tokens = output[0, inputs['input_ids'].size(1):]
92
  generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
93
  generated_text_w_json_wrapper = wrap_json_in_markdown(generated_text)
 
94
 
95
- return generated_text_w_json_wrapper
96
 
97
  def chatbot(image, text, history):
98
  if image is None:
 
4
  import torch
5
  import spaces
6
  import json
7
+ import re
8
+ from langdetect import detect, LangDetectException
9
+ from googletrans import Translator
10
 
11
  # Load the processor and model
12
  processor = AutoProcessor.from_pretrained(
 
72
  result.append(text) # Append any remaining text
73
  return ''.join(result)
74
 
75
def decode_unicode_sequences(unicode_seq):
    """
    Decode literal Unicode escape sequences (e.g., \\u4F60\\u597D) to characters.

    A high-surrogate escape immediately followed by a low-surrogate escape
    (e.g., \\uD83D\\uDE00) is combined into the single code point the pair
    encodes, so the result never contains a split pair that would fail to
    encode as UTF-8. A surrogate escape that is not part of such a pair is
    decoded on its own, exactly like any other \\uXXXX escape.

    Args:
        unicode_seq (str): A string containing Unicode escape sequences.
            Text that is not part of an escape is passed through unchanged.

    Returns:
        str: The string with every \\uXXXX escape replaced by its character.
    """
    # First alternative: a UTF-16 surrogate pair (high D800-DBFF, low DC00-DFFF).
    # Second alternative: any single \uXXXX escape.
    escape_pattern = re.compile(
        r'\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})'
        r'|\\u([0-9a-fA-F]{4})'
    )

    def replace_match(match):
        high, low, single = match.groups()
        if single is not None:
            return chr(int(single, 16))
        # Standard UTF-16 pair arithmetic: 10 bits from each half, offset 0x10000.
        high_bits = int(high, 16) - 0xD800
        low_bits = int(low, 16) - 0xDC00
        return chr(0x10000 + (high_bits << 10) + low_bits)

    return escape_pattern.sub(replace_match, unicode_seq)
96
+
97
def is_mandarin(text):
    """
    Report whether language detection classifies the text as Chinese.

    Args:
        text (str): The text to check.

    Returns:
        bool: True when `detect` reports 'zh-cn', 'zh-tw' or 'zh'; False for
            any other result, or when detection raises LangDetectException.
    """
    chinese_codes = ('zh-cn', 'zh-tw', 'zh')
    try:
        detected = detect(text)
    except LangDetectException:
        # Detection failed, so treat the text as not Chinese.
        return False
    return detected in chinese_codes
112
+
113
def translate_to_english(text, translator):
    """
    Translate the given Mandarin text to English.

    Works with translators whose `translate` returns the result directly and
    with translators whose `translate` returns a coroutine (as some recent
    googletrans releases do); a coroutine result is run to completion before
    its `.text` is read.

    Args:
        text (str): The Mandarin text to translate.
        translator (Translator): An object exposing
            `translate(text, src=..., dest=...)`, e.g. a googletrans Translator.

    Returns:
        str: The translated English text, or the original text unchanged if
            translation fails for any reason (the error is printed).
    """
    import asyncio
    import inspect

    pending = None
    try:
        translation = translator.translate(text, src='zh-cn', dest='en')
        if inspect.iscoroutine(translation):
            # Async translator API: drive the coroutine to get the real result.
            pending = translation
            translation = asyncio.run(pending)
        return translation.text
    except Exception as e:
        if pending is not None:
            # No-op if the coroutine finished; otherwise avoids a
            # "coroutine was never awaited" warning (e.g. when asyncio.run
            # refuses to start inside an already running event loop).
            pending.close()
        print(f"Translation error: {e}")
        return text  # Return the original text if translation fails
130
+
131
def process_text_for_mandarin_unicode(input_string):
    """
    Annotate runs of Unicode escape sequences that decode to Mandarin text.

    Each run of consecutive \\uXXXX escapes is decoded. If the decoded text is
    detected as Mandarin, the run is replaced with
    "<English translation> (<decoded text>)"; otherwise the original escape
    run is left exactly as it was.

    Args:
        input_string (str): The original string containing Unicode escape sequences.

    Returns:
        str: The processed string with translations where applicable.
    """
    # One translator instance is shared by every match in this call.
    translator = Translator()

    def annotate(match):
        escaped = match.group(0)
        decoded = decode_unicode_sequences(escaped)
        if not is_mandarin(decoded):
            # Not Mandarin: keep the raw escape sequence untouched.
            return escaped
        english = translate_to_english(decoded, translator)
        return f"{english} ({decoded})"

    # Match groups of one or more consecutive \uXXXX sequences.
    return re.sub(r'(?:\\u[0-9a-fA-F]{4})+', annotate, input_string)
163
+
164
  @spaces.GPU()
165
  def process_image_and_text(image, text):
166
  # Process the image and text
 
183
  generated_tokens = output[0, inputs['input_ids'].size(1):]
184
  generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
185
  generated_text_w_json_wrapper = wrap_json_in_markdown(generated_text)
186
+ generated_text_w_unicode_mdn = process_text_for_mandarin_unicode(generated_text_w_json_wrapper)
187
 
188
+ return generated_text_w_unicode_mdn
189
 
190
  def chatbot(image, text, history):
191
  if image is None:
requirements.txt CHANGED
@@ -4,4 +4,6 @@ Pillow
4
  torchvision
5
  einops
6
  accelerate
7
- tensorflow
 
 
 
4
  torchvision
5
  einops
6
  accelerate
7
+ tensorflow
8
+ langdetect
9
+ googletrans