rockerritesh committed on
Commit
76f42d9
·
verified ·
1 Parent(s): 48b1200

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -0
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import PyPDF2
3
+ import io
4
+ import os
5
+
6
+
7
# Lookup tables mapping Preeti key codes to Devanagari Unicode strings.

# Lowercase keys 'a'..'z'; position is ord(key) - 97.
unicodeatoz = [
    "ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व",  # a-j
    "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त",  # k-t
    "ग", "ख", "ध", "ह", "थ", "श",  # u-z
]

# Uppercase keys 'A'..'Z'; position is ord(key) - 65.
unicodeAtoZ = [
    "ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्",  # A-J
    "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्",  # K-T
    "ग्", "ख्", "ध्", "ह्", "थ्", "श्",  # U-Z
]

# Digit keys '0'..'9'; position is ord(key) - 48.
unicode0to9 = [
    "ण्", "ज्ञ", "द्द", "घ", "द्ध",  # 0-4
    "छ", "ट", "ठ", "ड", "ढ",  # 5-9
]

# Every other key (punctuation and a few extended characters).
symbolsDict = {
    # Shifted number row and backtick.
    "~": "ञ्",
    "`": "ञ",
    "!": "१",
    "@": "२",
    "#": "३",
    "$": "४",
    "%": "५",
    "^": "६",
    "&": "७",
    "*": "८",
    "(": "९",
    ")": "०",
    "-": "(",
    "_": ")",
    "+": "ं",
    # Brackets, backslash and pipe.
    "[": "ृ",
    "{": "र्",
    "]": "े",
    "}": "ै",
    "\\": "्",
    "|": "्र",
    # Remaining punctuation keys.
    ";": "स",
    ":": "स्",
    "'": "ु",
    "\"": "ू",
    ",": ",",
    "<": "?",
    ".": "।",
    ">": "श्र",
    "/": "र",
    "?": "रु",
    "=": ".",
    # Non-ASCII keys.
    "ˆ": "फ्",
    "Î": "ङ्ख",
    "å": "द्व",
    "÷": "/",
}
48
+
49
def normalizePreeti(preetitxt):
    """Reorder Preeti-encoded text so it can be mapped character by character.

    Three things happen, in this order:

    1. A fixed set of two-character key sequences is replaced by a single
       Devanagari character (or, for ``'qm'``, by the key pair ``'s|'``).
    2. The key ``'{'`` (mapped to "र्" by the symbol table) is moved in front
       of the character it follows, or in front of "character + 'f'/'ो'"
       when it follows such a pair.
    3. The key ``'l'`` (mapped to "ि") is moved after the character that
       follows it.

    Args:
        preetitxt: Text typed with the Preeti key layout.

    Returns:
        The reordered string; an empty input yields an empty string.
    """
    substitutions = (
        ('qm', 's|'),
        ('f]', 'ो'),
        ('km', 'फ'),
        ('0f', 'ण'),
        ('If', 'क्ष'),
        ('if', 'ष'),
        ('cf', 'आ'),
    )
    for old, new in substitutions:
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    pending = ''  # an 'l' waiting to be emitted after the next character
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        # Explicit bounds checks: running out of look-ahead for the
        # two-ahead test must not suppress the one-ahead test, otherwise a
        # '{' at the very end of the text is never moved.
        following = preetitxt[index + 1] if index + 1 < length else ''
        second_next = preetitxt[index + 2] if index + 2 < length else ''

        if second_next == '{' and following in ('f', 'ो'):
            # character + vowel sign + '{'  ->  '{' + character + vowel sign
            pieces.append('{' + character + pending + following)
            pending = ''
            index += 3
            continue
        if following == '{' and character != 'f':
            # character + '{'  ->  '{' + character; a pending 'l' belongs to
            # this character, not to whatever comes after the '{'.
            pieces.append('{' + character + pending)
            pending = ''
            index += 2
            continue

        if character == 'l':
            pending = 'l'
        else:
            pieces.append(character + pending)
            pending = ''
        index += 1

    # Keep a trailing 'l' instead of silently dropping it.
    pieces.append(pending)
    return ''.join(pieces)
83
+
84
def convert(preeti):
    """Translate Preeti-encoded text into Devanagari Unicode.

    The input is first passed through ``normalizePreeti``; each resulting
    character is then looked up in the letter, digit or symbol table.
    Characters with no table entry are copied through unchanged.

    Args:
        preeti: Text typed with the Preeti key layout.

    Returns:
        The converted string.
    """
    def _translate(glyph):
        # Map one normalized character to its Unicode replacement.
        code = ord(glyph)
        if 97 <= code <= 122:  # 'a'..'z'
            return unicodeatoz[code - 97]
        if 65 <= code <= 90:  # 'A'..'Z'
            return unicodeAtoZ[code - 65]
        if 48 <= code <= 57:  # '0'..'9'
            return unicode0to9[code - 48]
        return symbolsDict.get(glyph, glyph)

    return ''.join(_translate(glyph) for glyph in normalizePreeti(preeti))
101
+
102
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The extracted text of all pages joined without a separator.
    """
    with open(pdf_file, 'rb') as handle:
        pages = PyPDF2.PdfReader(handle).pages
        # Extraction happens while the file is still open.
        return ''.join(page.extract_text() for page in pages)
109
+
110
def process_file(inputfile):
    """Convert a Preeti-encoded file on disk to Devanagari Unicode text.

    Files whose extension is ``.pdf`` (case-insensitive) are read through
    ``extract_text_from_pdf``; any other file is read as text.

    Args:
        inputfile: Path of the PDF or text file to convert.

    Returns:
        The converted text.
    """
    ext = os.path.splitext(inputfile)[1].lower()
    if ext == '.pdf':
        preeti = extract_text_from_pdf(inputfile)
    else:
        # Decode explicitly as UTF-8 rather than with the platform default,
        # so the result does not vary by OS and matches how main() decodes
        # uploaded .txt files. The symbol table contains non-ASCII keys that
        # a different codec would decode to other characters.
        with open(inputfile, "r", encoding="utf-8") as fp:
            preeti = fp.read()
    return convert(preeti)
118
+
119
def main():
    """Render the Streamlit page.

    Shows a file uploader for a PDF or TXT file. Once a file is uploaded,
    its text is extracted, converted with ``convert``, shown next to the
    original, and offered as a UTF-8 text download. A footer is always
    rendered at the bottom of the page.
    """
    st.title("PDF/TXT to Unicode Converter(Nepali RAG)")

    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])

    if uploaded_file is not None:
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()

        if file_extension == ".pdf":
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
        else:  # .txt file
            text = uploaded_file.getvalue().decode("utf-8")

        converted_text = convert(text)

        # The two text areas need distinct labels: with identical (empty)
        # labels they become indistinguishable to Streamlit whenever the
        # converted text equals the original (e.g. a PDF with no extractable
        # text), which raises a duplicate-widget error. The labels are
        # collapsed so the page still shows only the subheaders.
        st.subheader("Original Text")
        st.text_area(
            "Original Text",
            value=text,
            height=200,
            label_visibility="collapsed",
        )

        st.subheader("Converted Text")
        st.text_area(
            "Converted Text",
            value=converted_text,
            height=200,
            label_visibility="collapsed",
        )

        # Create a download button for the converted text
        st.download_button(
            label="Download Converted Text",
            data=converted_text.encode("utf-8"),
            file_name="converted_text.txt",
            mime="text/plain"
        )

    # Write footer
    st.markdown("Made with ❤️ by Amnil@[Sumit Yadav](https://sumityadav.com.np)")
153
+
154
# Start the Streamlit UI only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()