rockerritesh committed on
Commit
93294e9
·
verified ·
1 Parent(s): 6a89f1b

better way to handle english

Browse files
Files changed (1) hide show
  1. app.py +151 -96
app.py CHANGED
@@ -2,151 +2,206 @@ import streamlit as st
2
  import PyPDF2
3
  import io
4
  import os
 
5
 
6
-
7
# Preeti -> Devanagari lookup tables.
# unicodeatoz[i] is the replacement for chr(ord('a') + i),
# unicodeAtoZ[i] for chr(ord('A') + i) and unicode0to9[i] for the digit i.
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]

# Replacements for every other key the Preeti layout uses. Each key must map
# to a non-empty string: an empty value silently deletes the character.
symbolsDict = {
    "~": "ञ्",
    "`": "ञ",
    "!": "१",
    "@": "२",
    "#": "३",
    "$": "४",
    "%": "५",
    "^": "६",
    "&": "७",
    "*": "८",
    "(": "९",
    ")": "०",
    "-": "(",
    "_": ")",
    "+": "ं",
    "[": "ृ",
    "{": "र्",
    "]": "े",
    "}": "ै",
    "\\": "्",
    "|": "्र",
    ";": "स",
    ":": "स्",
    "'": "ु",
    "\"": "ू",
    ",": ",",
    "<": "?",
    ".": "।",
    ">": "श्र",
    "/": "र",
    "?": "रु",
    "=": ".",
    "ˆ": "फ्",
    "Î": "ङ्ख",
    "å": "द्व",
    "÷": "/",
}
48
 
 
 
 
 
 
 
 
 
 
 
 
 
49
def normalizePreeti(preetitxt):
    """Reorder Preeti keystrokes so a left-to-right lookup yields valid Unicode.

    Three things happen, in this order:

    1. Multi-key sequences that stand for a single glyph are rewritten.
    2. '{' (mapped to 'र्' in symbolsDict) is typed after the consonant it
       belongs to; it is moved in front of that consonant, or in front of
       consonant + 'f'/'ो' when one of those sits in between.
    3. 'l' (mapped to 'ि') is typed before its consonant; it is moved
       behind the next character.

    Args:
        preetitxt: Preeti-encoded text.

    Returns:
        The reordered text, still in Preeti key codes apart from the glyphs
        inserted by step 1.
    """
    # Order matters: 'f]' has to be consumed before 'cf', '0f' and 'if' are
    # looked for, otherwise a sequence such as 'cf]' would be split wrongly.
    replacements = (
        ('qm', 's|'),
        ('f]', 'ो'),
        ('km', 'फ'),
        ('0f', 'ण'),
        ('If', 'क्ष'),
        ('if', 'ष'),
        ('cf', 'आ'),
    )
    for old, new in replacements:
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    previoussymbol = ''
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        following = preetitxt[index + 1] if index + 1 < length else ''

        # consonant + (f | ो) + '{'  ->  '{' + consonant + (f | ो)
        if (index + 2 < length and preetitxt[index + 2] == '{'
                and following in ('f', 'ो')):
            pieces.append('{' + character + following)
            index += 3
            continue

        # consonant + '{'  ->  '{' + consonant. Explicit bounds checks (rather
        # than catching IndexError) keep this working when '{' is the last
        # character of the text.
        if following == '{' and character != 'f':
            pieces.append('{' + character)
            index += 2
            continue

        index += 1
        if character == 'l':
            # Hold the 'l' back and emit it after the next character.
            previoussymbol = 'l'
            continue
        pieces.append(character + previoussymbol)
        previoussymbol = ''

    return ''.join(pieces)
83
 
84
def convert(preeti):
    """Convert Preeti-encoded text to Unicode Devanagari.

    The text is first reordered by normalizePreeti and then translated one
    character at a time: a-z through unicodeatoz, A-Z through unicodeAtoZ,
    0-9 through unicode0to9 and everything else through symbolsDict.
    Characters with no symbolsDict entry are copied through unchanged.

    Args:
        preeti: Preeti-encoded text.

    Returns:
        The converted text.
    """
    pieces = []
    for character in normalizePreeti(preeti):
        if 'a' <= character <= 'z':
            pieces.append(unicodeatoz[ord(character) - ord('a')])
        elif 'A' <= character <= 'Z':
            pieces.append(unicodeAtoZ[ord(character) - ord('A')])
        elif '0' <= character <= '9':
            pieces.append(unicode0to9[ord(character) - ord('0')])
        else:
            # Unknown characters (whitespace, already-converted glyphs, ...)
            # pass through as they are.
            pieces.append(symbolsDict.get(character, character))
    return ''.join(pieces)
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF on disk.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order.
    """
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ''` guards against a page whose extraction yields a falsy
        # result, which would otherwise break the concatenation.
        return ''.join(page.extract_text() or '' for page in reader.pages)
109
 
110
def process_file(inputfile):
    """Read a PDF or text file from disk and convert its content to Unicode.

    Args:
        inputfile: Path of the input file. A '.pdf' extension (any case)
            selects PDF extraction; anything else is read as text.

    Returns:
        The converted text produced by convert().
    """
    ext = os.path.splitext(inputfile)[1].lower()
    if ext == '.pdf':
        preeti = extract_text_from_pdf(inputfile)
    else:
        # Decode explicitly as UTF-8, the same way main() decodes uploads,
        # so the non-ASCII keys of symbolsDict match on every platform.
        with open(inputfile, "r", encoding="utf-8") as fp:
            preeti = fp.read()
    return convert(preeti)
118
-
119
def main():
    """Streamlit page: upload a PDF/TXT file, show and download the conversion."""
    st.title("PDF/TXT to Unicode Converter")

    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
    if uploaded_file is None:
        return

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    if file_extension == ".pdf":
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
        # `or ""` guards against a page whose extraction yields a falsy result.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    else:  # .txt file
        text = uploaded_file.getvalue().decode("utf-8")

    converted_text = convert(text)

    # The two text areas need distinct labels: with identical (empty) labels
    # they become identical widgets whenever the input and the output are the
    # same string (e.g. an empty extraction), which Streamlit rejects as a
    # duplicate widget. The labels are hidden because the subheaders already
    # title each box.
    st.subheader("Original Text")
    st.text_area("Original Text", value=text, height=200,
                 label_visibility="collapsed")

    st.subheader("Converted Text")
    st.text_area("Converted Text", value=converted_text, height=200,
                 label_visibility="collapsed")

    # Create a download button for the converted text
    st.download_button(
        label="Download Converted Text",
        data=converted_text.encode("utf-8"),
        file_name="converted_text.txt",
        mime="text/plain",
    )
 
 
 
 
 
150
 
151
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
 
2
  import PyPDF2
3
  import io
4
  import os
5
+ import re
6
 
7
# Preeti -> Devanagari lookup tables.
# unicodeatoz[i] is the replacement for chr(ord('a') + i),
# unicodeAtoZ[i] for chr(ord('A') + i) and unicode0to9[i] for the digit i.
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]

# Replacements for every other key the Preeti layout uses. Each key must map
# to a non-empty string: an empty value silently deletes the character.
symbolsDict = {
    "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
    "^": "६", "&": "७", "*": "८", "(": "९", ")": "०", "-": "(", "_": ")",
    "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र",
    ";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।",
    ">": "श्र", "/": "र", "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख",
    "å": "द्व", "÷": "/",
}
19
 
20
# One alternation, compiled once at import time instead of being rebuilt and
# searched pattern by pattern on every call.
_PREETI_HINT = re.compile(
    r'cf|qm|If|0f|km|f\]'            # key pairs rewritten by normalizePreeti
    r'|[a-zA-Z]{2,}[\\|\[\]{}]'      # 2+ letters directly followed by \ | [ ] { }
)


def is_preeti_text(text):
    """Heuristically decide whether *text* looks like Preeti-encoded Nepali.

    Args:
        text: The text segment to inspect.

    Returns:
        True if the segment contains one of the Preeti key pairs
        (cf, qm, If, 0f, km, f]) or a run of two or more ASCII letters
        immediately followed by one of the characters \\ | [ ] { };
        False otherwise, including for the empty string.
    """
    return _PREETI_HINT.search(text) is not None
31
+
32
def normalizePreeti(preetitxt):
    """Reorder Preeti keystrokes so a left-to-right lookup yields valid Unicode.

    Three things happen, in this order:

    1. Multi-key sequences that stand for a single glyph are rewritten.
    2. '{' (mapped to 'र्' in symbolsDict) is typed after the consonant it
       belongs to; it is moved in front of that consonant, or in front of
       consonant + 'f'/'ो' when one of those sits in between.
    3. 'l' (mapped to 'ि') is typed before its consonant; it is moved
       behind the next character.

    Args:
        preetitxt: Preeti-encoded text.

    Returns:
        The reordered text, still in Preeti key codes apart from the glyphs
        inserted by step 1.
    """
    # Common Preeti substitutions. Order matters: 'f]' has to be consumed
    # before 'cf', '0f' and 'if' are looked for, so a tuple of pairs is used
    # to make the sequence explicit. No target may be empty, otherwise the
    # key pair would simply be deleted from the text.
    replacements = (
        ('qm', 's|'),
        ('f]', 'ो'),
        ('km', 'फ'),
        ('0f', 'ण'),
        ('If', 'क्ष'),
        ('if', 'ष'),
        ('cf', 'आ'),
    )
    for old, new in replacements:
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    previoussymbol = ''
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        following = preetitxt[index + 1] if index + 1 < length else ''

        # consonant + (f | ो) + '{'  ->  '{' + consonant + (f | ो)
        if (index + 2 < length and preetitxt[index + 2] == '{'
                and following in ('f', 'ो')):
            pieces.append('{' + character + following)
            index += 3
            continue

        # consonant + '{'  ->  '{' + consonant
        if following == '{' and character != 'f':
            pieces.append('{' + character)
            index += 2
            continue

        index += 1
        if character == 'l':
            # Hold the 'l' back and emit it after the next character.
            previoussymbol = 'l'
            continue
        pieces.append(character + previoussymbol)
        previoussymbol = ''

    return ''.join(pieces)
79
 
80
def convert_preeti_segment(preeti):
    """Convert a single Preeti segment to Unicode Devanagari.

    The segment is first reordered by normalizePreeti and then translated
    one character at a time: a-z through unicodeatoz, A-Z through
    unicodeAtoZ, 0-9 through unicode0to9 and everything else through
    symbolsDict. Characters with no symbolsDict entry are copied through
    unchanged.

    Args:
        preeti: Preeti-encoded text.

    Returns:
        The converted text.
    """
    pieces = []
    for character in normalizePreeti(preeti):
        # The range checks keep every list index in bounds and .get() never
        # raises, so no exception handling is needed here.
        if 'a' <= character <= 'z':
            pieces.append(unicodeatoz[ord(character) - ord('a')])
        elif 'A' <= character <= 'Z':
            pieces.append(unicodeAtoZ[ord(character) - ord('A')])
        elif '0' <= character <= '9':
            pieces.append(unicode0to9[ord(character) - ord('0')])
        else:
            pieces.append(symbolsDict.get(character, character))
    return ''.join(pieces)
99
 
100
# Spans that are copied through untouched. Alternatives are tried in this
# order at each position, so the more specific shapes come first.
_PRESERVE_PATTERN = re.compile('|'.join((
    # Email addresses
    r'\b[\w\.-]+@[\w\.-]+\.\w+\b',
    # URLs
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    # Date patterns
    r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b',
    # Common English words (3 or more characters)
    r'\b[A-Za-z]{3,}\b',
    # Numbers with units
    r'\b\d+\s*[A-Za-z]+\b',
)))


def _convert_unmatched(segment):
    """Convert *segment* if it looks like Preeti, otherwise return it as is."""
    if is_preeti_text(segment):
        return convert_preeti_segment(segment)
    return segment


def smart_convert(text):
    """Convert Preeti text to Unicode while preserving English segments.

    Emails, URLs, dates, ASCII words of three or more letters and numbers
    with units are copied through verbatim. The text between those matches
    is converted with convert_preeti_segment when is_preeti_text accepts it
    and copied through otherwise.

    Args:
        text: Mixed Preeti/English text.

    Returns:
        The text with the Preeti parts converted.
    """
    # NOTE(review): Preeti itself is typed with ASCII letters, so the
    # "3+ letter word" alternative also preserves Preeti words that consist
    # only of letters. Telling those apart needs a better signal than this
    # pattern - confirm the intended trade-off.
    pieces = []
    last_end = 0

    for match in _PRESERVE_PATTERN.finditer(text):
        start, end = match.span()
        if start > last_end:
            # Text between two preserved spans. Whitespace-only gaps are kept
            # too (they fail is_preeti_text and pass through unchanged);
            # dropping them would glue neighbouring words and lines together.
            pieces.append(_convert_unmatched(text[last_end:start]))
        pieces.append(match.group())
        last_end = end

    if last_end < len(text):
        pieces.append(_convert_unmatched(text[last_end:]))

    return ''.join(pieces)
154
+
155
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF on disk.

    Any failure while opening or parsing the file is reported in the
    Streamlit page through st.error instead of being raised.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order, or '' if reading failed.
    """
    try:
        with open(pdf_file, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ''` guards against a page whose extraction yields a falsy
            # result, which would otherwise break the concatenation.
            return ''.join(page.extract_text() or '' for page in reader.pages)
    except Exception as e:
        # Deliberately broad: this is a UI boundary and every failure should
        # surface as a message on the page rather than a stack trace.
        st.error(f"Error reading PDF: {str(e)}")
        return ''
167
 
 
 
 
 
 
 
 
 
 
168
def main():
    """Streamlit page: upload a PDF/TXT file, show and download the conversion."""
    st.title("Smart Preeti to Unicode Converter")
    st.write("This converter preserves English text while converting Preeti to Unicode")

    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
    if uploaded_file is None:
        return

    try:
        if uploaded_file.name.lower().endswith('.pdf'):
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
            # `or ''` guards against a page whose extraction yields a falsy
            # result.
            text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
        else:  # .txt file
            text = uploaded_file.getvalue().decode("utf-8")

        converted_text = smart_convert(text)

        col1, col2 = st.columns(2)

        # The two text areas need distinct labels: with identical (empty)
        # labels they become identical widgets whenever the converted text
        # equals the original (e.g. an English-only or empty document), which
        # Streamlit rejects as a duplicate widget. The labels are hidden
        # because the subheaders already title each box.
        with col1:
            st.subheader("Original Text")
            st.text_area("Original Text", value=text, height=300,
                         label_visibility="collapsed")

        with col2:
            st.subheader("Converted Text")
            st.text_area("Converted Text", value=converted_text, height=300,
                         label_visibility="collapsed")

        st.download_button(
            label="Download Converted Text",
            data=converted_text.encode("utf-8"),
            file_name="converted_text.txt",
            mime="text/plain",
        )

    except Exception as e:
        # Deliberately broad: this is the top-level UI boundary, so any
        # failure (unreadable PDF, non-UTF-8 text file, ...) is shown on the
        # page instead of crashing the app.
        st.error(f"An error occurred: {str(e)}")
205
 
206
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()