Spaces:

rockerritesh
/

preeti-unicode

Running

File size: 7,876 Bytes

import streamlit as st
import PyPDF2
import io
import os
import re
import string
import nltk

# # Download NLTK resources
# nltk.download('words')

# # English words from NLTK corpus
# english_words = set(nltk.corpus.words.words())

# with open("index.dic") as f:
#     hunspell_words = {line.split("/")[0].strip() for line in f if not line.startswith("#")}

# def is_english_word(word):
#     return word.lower() in hunspell_words

from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import words, wordnet
import spacy
from spellchecker import SpellChecker
import string

# Download necessary NLTK resources
nltk.download('wordnet')
nltk.download('words')

# Initialize tools
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
english_words = set(words.words())
nlp = spacy.load("en_core_web_sm")  # SpaCy language model
spell = SpellChecker()  # Spell checker

import en_core_web_sm
nlp = en_core_web_sm.load()

# Combine dictionaries for better coverage
combined_dictionary = english_words.union(spell.word_frequency.keys())

def is_english_word(word):
    """
    Checks if a word is English and returns the valid English word or None if not recognized.
    """
    # Preprocess the word: strip punctuation and lowercase
    word_cleaned = word.lower().strip(string.punctuation)
    if not word_cleaned:
        return None

    # 1. Direct dictionary match
    if word_cleaned in combined_dictionary:
        return word_cleaned

    # 2. Lemmatization
    lemma = lemmatizer.lemmatize(word_cleaned)
    if lemma in combined_dictionary:
        return lemma

    # 3. Stemming
    stem = stemmer.stem(word_cleaned)
    if stem in combined_dictionary:
        return stem

    # 4. Spell checker
    corrected_word = spell.correction(word_cleaned)
    if corrected_word in combined_dictionary:
        return corrected_word

    # 5. SpaCy's language model (check if token is recognized as English)
    doc = nlp(word_cleaned)
    if doc and doc[0].is_alpha and doc[0].lang_ == "en":
        return word_cleaned

    return None



# Define Devanagari digits and patterns for matching
DEVANAGARI_DIGITS = {'०', '१', '२', '३', '४', '५', '६', '७', '८', '९', '१०'}
DEVANAGARI_PATTERN = re.compile(r'^[०-९]+(?:[.,/-][०-९]+)*$')  # Match Devanagari digits
NUMERIC_PATTERN = re.compile(r'^\d+(?:[.,/]\d+)*$')  # Match numeric patterns

# Unicode conversion mappings
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
symbolsDict = {
    "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५", "^": "६", "&": "७", "*": "८", "(": "९",
    ")": "०", "-": "(", "_": ")", "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र", ";": "स",
    ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।", ">": "श्र", "/": "र", "?": "रु", "=": ".",
    "ˆ": "फ्", "Î": "ङ्ख", "å": "द्व", "÷": "/"
}

def normalizePreeti(preetitxt):
    normalized = ''
    previoussymbol = ''
    preetitxt = preetitxt.replace('qm', 's|')
    preetitxt = preetitxt.replace('f]', 'ो')
    preetitxt = preetitxt.replace('km', 'फ')
    preetitxt = preetitxt.replace('0f', 'ण')
    preetitxt = preetitxt.replace('If', 'क्ष')
    preetitxt = preetitxt.replace('if', 'ष')
    preetitxt = preetitxt.replace('cf', 'आ')
    index = -1
    while index + 1 < len(preetitxt):
        index += 1
        character = preetitxt[index]
        try:
            if preetitxt[index + 2] == '{':
                if preetitxt[index + 1] == 'f' or preetitxt[index + 1] == 'ो':
                    normalized += '{' + character + preetitxt[index + 1]
                    index += 2
                    continue
            if preetitxt[index + 1] == '{':
                if character != 'f':
                    normalized += '{' + character
                    index += 1
                    continue
        except IndexError:
            pass
        if character == 'l':
            previoussymbol = 'l'
            continue
        else:
            normalized += character + previoussymbol
            previoussymbol = ''
    return normalized

def convert(preeti):
    converted = ''
    normalizedpreeti = normalizePreeti(preeti)
    for index, character in enumerate(normalizedpreeti):
        try:
            if ord(character) >= 97 and ord(character) <= 122:
                converted += unicodeatoz[ord(character) - 97]
            elif ord(character) >= 65 and ord(character) <= 90:
                converted += unicodeAtoZ[ord(character) - 65]
            elif ord(character) >= 48 and ord(character) <= 57:
                converted += unicode0to9[ord(character) - 48]
            else:
                converted += symbolsDict[character]
        except KeyError:
            converted += character

    return converted

# def is_english_word(word):
#     """Check if a word is English."""
#     word = word.lower().strip(string.punctuation)
#     return word in english_words

def is_valid_numeric(word):
    """Check if the word is a valid numeric string."""
    return bool(NUMERIC_PATTERN.match(word))

def is_devanagari_digit(word):
    """Check if the word contains only Devanagari digits."""
    return bool(DEVANAGARI_PATTERN.match(word))

def process_text_word_by_word(page_text):
    """Process each word and retain or convert based on language."""
    processed_text = []
    words_in_page = page_text.split()

    for word in words_in_page:
        word_cleaned = word.strip(string.punctuation)
        if is_english_word(word_cleaned):
            processed_text.append(word)  # Retain English words
        elif is_devanagari_digit(word_cleaned):
            processed_text.append(word)  # Retain Devanagari digits
        elif is_valid_numeric(word_cleaned):
            processed_text.append(word)  # Retain numeric expressions
        else:
            processed_text.append(convert(word))  # Convert other words
    
    return ' '.join(processed_text)

def text_both_english_and_nepali(pdf_file):
    """Process text from each page of a PDF."""
    pages_with_english = []
    text = ""

    # Extract text from PDF
    reader = PyPDF2.PdfReader(pdf_file)
    for page_num, page in enumerate(reader.pages):
        page_text = page.extract_text()
        processed_text = process_text_word_by_word(page_text)
        text += f"\nPage {page_num + 1}:\n{processed_text}"
    return text

def main():
    st.title("Advanced PDF/TXT to Unicode Converter")

    uploaded_file = st.file_uploader("Upload a PDF or TXT file", type=["pdf", "txt"])

    if uploaded_file is not None:
        text = ""
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()

        if file_extension == ".pdf":
            text = text_both_english_and_nepali(uploaded_file)
        elif file_extension == ".txt":
            text = process_text_word_by_word(uploaded_file.getvalue().decode("utf-8"))

        st.subheader("Processed Text")
        st.text_area("", value=text, height=400)

        # Download button for the processed text
        st.download_button(
            label="Download Processed Text",
            data=text.encode("utf-8"),
            file_name="processed_text.txt",
            mime="text/plain"
        )

if __name__ == "__main__":
    main()