rockerritesh committed on
Commit
5f99735
·
verified ·
1 Parent(s): b7163a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -121
app.py CHANGED
@@ -2,151 +2,110 @@ import streamlit as st
2
  import PyPDF2
3
  import io
4
  import os
 
 
 
5
 
 
 
6
 
 
 
 
 
 
 
 
 
 
7
  unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
8
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
9
  unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
10
  symbolsDict = {
11
- "~": "ञ्",
12
- "`": "",
13
- "!": "",
14
- "@": "",
15
- "#": "३",
16
- "$": "४",
17
- "%": "५",
18
- "^": "६",
19
- "&": "७",
20
- "*": "८",
21
- "(": "९",
22
- ")": "०",
23
- "-": "(",
24
- "_": ")",
25
- "+": "ं",
26
- "[": "ृ",
27
- "{": "र्",
28
- "]": "े",
29
- "}": "ै",
30
- "\\": "्",
31
- "|": "्र",
32
- ";": "स",
33
- ":": "स्",
34
- "'": "ु",
35
- "\"": "ू",
36
- ",": ",",
37
- "<": "?",
38
- ".": "।",
39
- ">": "श्र",
40
- "/": "र",
41
- "?": "रु",
42
- "=": ".",
43
- "ˆ": "फ्",
44
- "Î": "ङ्ख",
45
- "å": "द्व",
46
- "÷": "/"
47
  }
48
 
49
  def normalizePreeti(preetitxt):
50
- normalized = ''
51
- previoussymbol = ''
52
- preetitxt = preetitxt.replace('qm', 's|')
53
- preetitxt = preetitxt.replace('f]', 'ो')
54
- preetitxt = preetitxt.replace('km', 'फ')
55
- preetitxt = preetitxt.replace('0f', 'ण')
56
- preetitxt = preetitxt.replace('If', 'क्ष')
57
- preetitxt = preetitxt.replace('if', 'ष')
58
- preetitxt = preetitxt.replace('cf', 'आ')
59
- index = -1
60
- while index + 1 < len(preetitxt):
61
- index += 1
62
- character = preetitxt[index]
63
- try:
64
- if preetitxt[index + 2] == '{':
65
- if preetitxt[index + 1] == 'f' or preetitxt[index + 1] == 'ो':
66
- normalized += '{' + character + preetitxt[index + 1]
67
- index += 2
68
- continue
69
- if preetitxt[index + 1] == '{':
70
- if character != 'f':
71
- normalized += '{' + character
72
- index += 1
73
- continue
74
- except IndexError:
75
- pass
76
- if character == 'l':
77
- previoussymbol = 'l'
78
- continue
79
- else:
80
- normalized += character + previoussymbol
81
- previoussymbol = ''
82
- return normalized
83
 
84
  def convert(preeti):
85
- converted = ''
86
- normalizedpreeti = normalizePreeti(preeti)
87
- for index, character in enumerate(normalizedpreeti):
88
- try:
89
- if ord(character) >= 97 and ord(character) <= 122:
90
- converted += unicodeatoz[ord(character) - 97]
91
- elif ord(character) >= 65 and ord(character) <= 90:
92
- converted += unicodeAtoZ[ord(character) - 65]
93
- elif ord(character) >= 48 and ord(character) <= 57:
94
- converted += unicode0to9[ord(character) - 48]
95
- else:
96
- converted += symbolsDict[character]
97
- except KeyError:
98
- converted += character
99
-
100
- return converted
101
-
102
- def extract_text_from_pdf(pdf_file):
103
- text = ''
104
- with open(pdf_file, 'rb') as file:
105
- reader = PyPDF2.PdfReader(file)
106
- for page in reader.pages:
107
- text += page.extract_text()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  return text
109
 
110
- def process_file(inputfile):
111
- ext = os.path.splitext(inputfile)[1].lower()
112
- if ext == '.pdf':
113
- preeti = extract_text_from_pdf(inputfile)
114
- else:
115
- with open(inputfile, "r") as fp:
116
- preeti = fp.read()
117
- return convert(preeti)
118
-
119
  def main():
120
- st.title("PDF/TXT to Unicode Converter")
121
 
122
- uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
123
 
124
  if uploaded_file is not None:
 
125
  file_extension = os.path.splitext(uploaded_file.name)[1].lower()
126
 
127
  if file_extension == ".pdf":
128
- pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
129
- text = ""
130
- for page in pdf_reader.pages:
131
- text += page.extract_text()
132
- else: # .txt file
133
- text = uploaded_file.getvalue().decode("utf-8")
134
-
135
- converted_text = convert(text)
136
-
137
- st.subheader("Original Text")
138
- st.text_area("", value=text, height=200)
139
 
140
- st.subheader("Converted Text")
141
- st.text_area("", value=converted_text, height=200)
142
 
143
- # Create a download button for the converted text
144
  st.download_button(
145
- label="Download Converted Text",
146
- data=converted_text.encode("utf-8"),
147
- file_name="converted_text.txt",
148
  mime="text/plain"
149
  )
150
 
151
  if __name__ == "__main__":
152
- main()
 
2
  import PyPDF2
3
  import io
4
  import os
5
+ import re
6
+ import string
7
+ import nltk
8
 
# Make sure the NLTK "words" corpus is available before it is loaded.
# The lookup is done first so that the network download (and its console
# output) only happens when the corpus is actually missing, instead of on
# every execution of this script.
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words', quiet=True)

# English vocabulary from the NLTK corpus, kept as a set for O(1) membership tests.
english_words = set(nltk.corpus.words.words())
14
+
# The ten Devanagari digit characters, one entry per digit.
DEVANAGARI_DIGITS = {'०', '१', '२', '३', '४', '५', '६', '७', '८', '९'}
# One or more Devanagari digits, optionally continued by groups that are
# introduced by '.', ',' or '/' (e.g. "१२/०३").
DEVANAGARI_PATTERN = re.compile(r'^[०-९]+(?:[.,/][०-९]+)*$')
# Same shape, but for anything `\d` accepts (e.g. "3.14", "12/05").
NUMERIC_PATTERN = re.compile(r'^\d+(?:[.,/]\d+)*$')
19
+
# Preeti-to-Unicode lookup tables.
# unicodeatoz[i] is the replacement for the i-th lowercase ASCII letter ('a'..'z').
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
# unicodeAtoZ[i] is the replacement for the i-th uppercase ASCII letter ('A'..'Z').
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
# unicode0to9[i] is the replacement for the ASCII digit i ('0'..'9').
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
# Replacements for every other key. The shifted number row "!" .. "(" yields
# the Devanagari digits १ .. ९, and ")" yields ० (it must not be empty,
# otherwise every zero typed in Preeti disappears from the output).
symbolsDict = {
    "~": "ञ्", "`": "ञ",
    "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
    "^": "६", "&": "७", "*": "८", "(": "९", ")": "०",
    "-": "(", "_": ")", "+": "ं",
    "[": "ृ", "{": "र्", "]": "े", "}": "ै",
    "\\": "्", "|": "्र",
    ";": "स", ":": "स्",
    "'": "ु", "\"": "ू",
    ",": ",", "<": "?", ".": "।", ">": "श्र",
    "/": "र", "?": "रु", "=": ".",
    "ˆ": "फ्", "Î": "ङ्ख", "å": "द्व", "÷": "/",
}
30
 
def normalizePreeti(preetitxt):
    """Reorder Preeti keystrokes so they can be mapped one character at a time.

    Preeti is typed in visual order, while the converted text needs logical
    order. This pass:

    * rewrites a few multi-key sequences ('qm', 'f]', 'km', '0f', 'If',
      'if', 'cf') to their final form,
    * moves '{' in front of the character it follows (also across a
      following 'f' / 'ो'), and
    * moves 'l' behind the character it precedes.

    Args:
        preetitxt: Text typed with the Preeti key layout.

    Returns:
        The reordered text; an empty input yields an empty string.
    """
    # Multi-key sequences, applied in this exact order.
    for sequence, replacement in (
        ('qm', 's|'),
        ('f]', 'ो'),
        ('km', 'फ'),
        ('0f', 'ण'),
        ('If', 'क्ष'),
        ('if', 'ष'),
        ('cf', 'आ'),
    ):
        preetitxt = preetitxt.replace(sequence, replacement)

    pieces = []
    pending = ''  # an 'l' waiting to be emitted after the next character
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        # Bounds-checked lookahead: '' never equals a real character, so the
        # rules below also work on the last two characters of the input
        # (important because callers convert one word at a time).
        following = preetitxt[index + 1] if index + 1 < length else ''
        after_following = preetitxt[index + 2] if index + 2 < length else ''

        if after_following == '{' and following in ('f', 'ो'):
            # <char><f|ो>{  ->  {<char><f|ो>
            pieces.append('{' + character + pending + following)
            pending = ''
            index += 3
            continue
        if following == '{' and character != 'f':
            # <char>{  ->  {<char>
            pieces.append('{' + character + pending)
            pending = ''
            index += 2
            continue

        if character == 'l':
            pending = 'l'
        else:
            pieces.append(character + pending)
            pending = ''
        index += 1

    # Do not lose an 'l' that had no following character to attach to.
    pieces.append(pending)
    return ''.join(pieces)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
def convert(preeti):
    """Convert Preeti-encoded text to Unicode Devanagari.

    The text is first reordered by normalizePreeti() and then mapped one
    character at a time: lowercase letters through ``unicodeatoz``, uppercase
    letters through ``unicodeAtoZ``, ASCII digits through ``unicode0to9`` and
    everything else through ``symbolsDict``.

    Args:
        preeti: Text typed with the Preeti key layout.

    Returns:
        The converted text. Characters without a mapping (whitespace,
        characters that are already Unicode, ...) are passed through unchanged.
    """
    converted = []
    for character in normalizePreeti(preeti):
        if 'a' <= character <= 'z':
            converted.append(unicodeatoz[ord(character) - ord('a')])
        elif 'A' <= character <= 'Z':
            converted.append(unicodeAtoZ[ord(character) - ord('A')])
        elif '0' <= character <= '9':
            converted.append(unicode0to9[ord(character) - ord('0')])
        else:
            # Unknown characters are kept as they are.
            converted.append(symbolsDict.get(character, character))
    return ''.join(converted)
40
+
def is_english_word(word):
    """Return True when *word* is found in the English vocabulary set.

    The lookup is done on the lowercased word with leading and trailing
    ASCII punctuation removed.
    """
    candidate = word.lower()
    candidate = candidate.strip(string.punctuation)
    return candidate in english_words
45
+
def is_valid_numeric(word):
    """Return True when *word* matches NUMERIC_PATTERN from its start."""
    return NUMERIC_PATTERN.match(word) is not None
49
+
def is_devanagari_digit(word):
    """Return True when *word* matches DEVANAGARI_PATTERN from its start."""
    return DEVANAGARI_PATTERN.match(word) is not None
53
+
def process_text_word_by_word(page_text):
    """Convert the Preeti words of a text and keep everything else as it is.

    Each whitespace-separated word is kept unchanged when it is an English
    word, a Devanagari number or a numeric expression; every other word is
    passed to convert(). Line breaks of the input are preserved; runs of
    spaces inside a line are collapsed to a single space.

    Args:
        page_text: The text to process. ``None`` or an empty string is
            accepted and produces an empty result.

    Returns:
        The processed text.
    """
    if not page_text:
        return ''

    def process_word(word):
        # NOTE(review): punctuation keys are letters in Preeti (e.g. ';' is
        # mapped in symbolsDict), so stripping them before the checks can
        # make a Preeti word look like an English one. Confirm whether that
        # is acceptable for the expected inputs.
        word_cleaned = word.strip(string.punctuation)
        keep_as_is = (
            is_english_word(word_cleaned)
            or is_devanagari_digit(word_cleaned)
            or is_valid_numeric(word_cleaned)
        )
        return word if keep_as_is else convert(word)

    # Work line by line so the layout of the page is not flattened into a
    # single line of space-separated words.
    return '\n'.join(
        ' '.join(process_word(word) for word in line.split())
        for line in page_text.splitlines()
    )
71
+
def text_both_english_and_nepali(pdf_file):
    """Extract and process the text of every page of a PDF.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts (the caller in this
            file passes the uploaded file object).

    Returns:
        One string in which every page contributes a section of the form
        ``"\\nPage <n>:\\n<processed text>"``, with pages numbered from 1.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    sections = []
    for page_num, page in enumerate(reader.pages, start=1):
        # Guard against pages that yield no text so that one such page
        # cannot abort the whole document.
        page_text = page.extract_text() or ""
        processed_text = process_text_word_by_word(page_text)
        sections.append(f"\nPage {page_num}:\n{processed_text}")
    return "".join(sections)
84
 
 
 
 
 
 
 
 
 
 
def main():
    """Run the Streamlit page: upload a file, show and offer the processed text."""
    st.title("Advanced PDF/TXT to Unicode Converter")

    uploaded_file = st.file_uploader("Upload a PDF or TXT file", type=["pdf", "txt"])

    # Nothing to show until the user has picked a file.
    if uploaded_file is None:
        return

    _, file_extension = os.path.splitext(uploaded_file.name)
    file_extension = file_extension.lower()

    if file_extension == ".pdf":
        text = text_both_english_and_nepali(uploaded_file)
    elif file_extension == ".txt":
        raw_text = uploaded_file.getvalue().decode("utf-8")
        text = process_text_word_by_word(raw_text)
    else:
        # Any other extension produces an empty result.
        text = ""

    st.subheader("Processed Text")
    st.text_area("", value=text, height=400)

    # Offer the processed text as a UTF-8 encoded download.
    st.download_button(
        label="Download Processed Text",
        data=text.encode("utf-8"),
        file_name="processed_text.txt",
        mime="text/plain"
    )


if __name__ == "__main__":
    main()