rockerritesh committed on
Commit
66882a0
·
verified ·
1 Parent(s): 113d52a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -37
app.py CHANGED
@@ -1,22 +1,61 @@
1
  import streamlit as st
 
2
  import io
3
  import os
4
- import pdfplumber
5
 
6
  unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
7
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
8
  unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
9
  symbolsDict = {
10
- "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५", "^": "६", "&": "७", "*": "८",
11
- "(": "", ")": "०", "-": "(", "_": ")", "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्",
12
- "|": "्र", ";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।", ">": "श्र", "/": "र",
13
- "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख", "å": "द्व", "÷": "/"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  }
15
 
16
  def normalizePreeti(preetitxt):
17
  normalized = ''
18
  previoussymbol = ''
19
- preetitxt = preetitxt.replace('qm', 's|').replace('f]', 'ो').replace('km', 'फ').replace('0f', 'ण').replace('If', 'क्ष').replace('if', 'ष').replace('cf', 'आ')
 
 
 
 
 
 
20
  index = -1
21
  while index + 1 < len(preetitxt):
22
  index += 1
@@ -45,47 +84,40 @@ def normalizePreeti(preetitxt):
45
  def convert(preeti):
46
  converted = ''
47
  normalizedpreeti = normalizePreeti(preeti)
48
- for character in normalizedpreeti:
49
  try:
50
- if 97 <= ord(character) <= 122:
51
  converted += unicodeatoz[ord(character) - 97]
52
- elif 65 <= ord(character) <= 90:
53
  converted += unicodeAtoZ[ord(character) - 65]
54
- elif 48 <= ord(character) <= 57:
55
  converted += unicode0to9[ord(character) - 48]
56
  else:
57
  converted += symbolsDict[character]
58
  except KeyError:
59
  converted += character
 
60
  return converted
61
 
62
  def extract_text_from_pdf(pdf_file):
63
  text = ''
64
- pdf = pdfplumber.open(pdf_file)
65
- for page in pdf.pages:
66
- extracted_text = page.extract_text()
67
- if extracted_text:
68
- text += extracted_text
69
- return handle_vertical_text(text)
70
-
71
- def handle_vertical_text(text):
72
- lines = text.split('\n')
73
- vertical_lines = []
74
- horizontal_line = ''
75
- for line in lines:
76
- if len(line) == 1: # Possible vertical arrangement (single character per line)
77
- horizontal_line += line
78
- else:
79
- if horizontal_line: # If we've built a horizontal line, add it.
80
- vertical_lines.append(horizontal_line)
81
- horizontal_line = ''
82
- vertical_lines.append(line) # Add the full line if it's not vertical.
83
- if horizontal_line:
84
- vertical_lines.append(horizontal_line)
85
- return ' '.join(vertical_lines)
86
 
87
  def main():
88
- st.title("PDF/TXT to Unicode Converter (Nepali RAG)")
89
 
90
  uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
91
 
@@ -93,7 +125,10 @@ def main():
93
  file_extension = os.path.splitext(uploaded_file.name)[1].lower()
94
 
95
  if file_extension == ".pdf":
96
- text = extract_text_from_pdf(io.BytesIO(uploaded_file.read()))
 
 
 
97
  else: # .txt file
98
  text = uploaded_file.getvalue().decode("utf-8")
99
 
@@ -113,8 +148,5 @@ def main():
113
  mime="text/plain"
114
  )
115
 
116
- # Write footer
117
- st.markdown("Made with ❤️ by Sumit Yadav(https://sumityadav.com.np)")
118
-
119
  if __name__ == "__main__":
120
  main()
 
1
  import streamlit as st
2
+ import PyPDF2
3
  import io
4
  import os
5
+
6
 
7
  unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
8
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
9
  unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
10
  symbolsDict = {
11
+ "~": "ञ्",
12
+ "`": "",
13
+ "!": "",
14
+ "@": "",
15
+ "#": "३",
16
+ "$": "४",
17
+ "%": "५",
18
+ "^": "६",
19
+ "&": "७",
20
+ "*": "८",
21
+ "(": "९",
22
+ ")": "०",
23
+ "-": "(",
24
+ "_": ")",
25
+ "+": "ं",
26
+ "[": "ृ",
27
+ "{": "र्",
28
+ "]": "े",
29
+ "}": "ै",
30
+ "\\": "्",
31
+ "|": "्र",
32
+ ";": "स",
33
+ ":": "स्",
34
+ "'": "ु",
35
+ "\"": "ू",
36
+ ",": ",",
37
+ "<": "?",
38
+ ".": "।",
39
+ ">": "श्र",
40
+ "/": "र",
41
+ "?": "रु",
42
+ "=": ".",
43
+ "ˆ": "फ्",
44
+ "Î": "ङ्ख",
45
+ "å": "द्व",
46
+ "÷": "/"
47
  }
48
 
49
  def normalizePreeti(preetitxt):
50
  normalized = ''
51
  previoussymbol = ''
52
+ preetitxt = preetitxt.replace('qm', 's|')
53
+ preetitxt = preetitxt.replace('f]', 'ो')
54
+ preetitxt = preetitxt.replace('km', 'फ')
55
+ preetitxt = preetitxt.replace('0f', 'ण')
56
+ preetitxt = preetitxt.replace('If', 'क्ष')
57
+ preetitxt = preetitxt.replace('if', 'ष')
58
+ preetitxt = preetitxt.replace('cf', 'आ')
59
  index = -1
60
  while index + 1 < len(preetitxt):
61
  index += 1
 
84
  def convert(preeti):
85
  converted = ''
86
  normalizedpreeti = normalizePreeti(preeti)
87
+ for index, character in enumerate(normalizedpreeti):
88
  try:
89
+ if ord(character) >= 97 and ord(character) <= 122:
90
  converted += unicodeatoz[ord(character) - 97]
91
+ elif ord(character) >= 65 and ord(character) <= 90:
92
  converted += unicodeAtoZ[ord(character) - 65]
93
+ elif ord(character) >= 48 and ord(character) <= 57:
94
  converted += unicode0to9[ord(character) - 48]
95
  else:
96
  converted += symbolsDict[character]
97
  except KeyError:
98
  converted += character
99
+
100
  return converted
101
 
102
  def extract_text_from_pdf(pdf_file):
103
  text = ''
104
+ with open(pdf_file, 'rb') as file:
105
+ reader = PyPDF2.PdfReader(file)
106
+ for page in reader.pages:
107
+ text += page.extract_text()
108
+ return text
109
+
110
def process_file(inputfile):
    """Read a PDF or text file from disk and return its converted text.

    Args:
        inputfile: Path of the file to convert. A ".pdf" extension
            (case-insensitive) is read with extract_text_from_pdf(); any
            other extension is read as a text file.

    Returns:
        The result of convert() applied to the file's text.
    """
    ext = os.path.splitext(inputfile)[1].lower()
    if ext == '.pdf':
        preeti = extract_text_from_pdf(inputfile)
    else:
        # Decode explicitly as UTF-8, matching how main() decodes uploaded
        # .txt files, instead of relying on the platform default encoding.
        with open(inputfile, "r", encoding="utf-8") as fp:
            preeti = fp.read()
    return convert(preeti)
 
 
 
 
 
 
 
 
118
 
119
  def main():
120
+ st.title("PDF/TXT to Unicode Converter")
121
 
122
  uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
123
 
 
125
  file_extension = os.path.splitext(uploaded_file.name)[1].lower()
126
 
127
  if file_extension == ".pdf":
128
+ pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
129
+ text = ""
130
+ for page in pdf_reader.pages:
131
+ text += page.extract_text()
132
  else: # .txt file
133
  text = uploaded_file.getvalue().decode("utf-8")
134
 
 
148
  mime="text/plain"
149
  )
150
 
 
 
 
151
  if __name__ == "__main__":
152
  main()