# Spaces:
#
# SalmanML
# /
#
# NER_transformer_english
#
# Runtime error
#
# File size: 13,694 Bytes

# from flair.data import Sentence
# from flair.models import SequenceTagger
# import streamlit as st

# # load tagger
# tagger = SequenceTagger.load("flair/ner-english-large")

# # make example sentence
# text=st.text_area("Enter the text to detect it's named entities")
# sentence = Sentence(text)

# # predict NER tags
# tagger.predict(sentence)

# # print sentence
# print(sentence)

# # print predicted NER spans
# print('The following NER tags are found:')
# # iterate over entities and print
# for entity in sentence.get_spans('ner'):
#     print(entity)



# import easyocr
# import cv2
# import requests
# import re
# from PIL import Image
# import streamlit as st
# # import os


# # Load the EasyOCR reader
# reader = easyocr.Reader(['en'])


# # key=os.environ.getattribute("api_key")
# # print(key)
# API_URL = "https://api-inference.huggingface.co/models/flair/ner-english-large"
# headers = {"Authorization": st.secrets["api_key"]}

# ## Image uploading function ##
# def image_upload_and_ocr(reader):
#     uploaded_file=st.file_uploader(label=':red[**please upload a busines card** :sunglasses:]',type=['jpeg','jpg','png','webp'])
#     if uploaded_file is not None:
#         image=Image.open(uploaded_file)
#         image=image.resize((640,480))
#         result2 = reader.readtext(image)
#         # result2=result
#         texts = [item[1] for item in result2]
#         result=' '.join(texts)
#     return result2,result


# def query(payload):
#     response = requests.post(API_URL, headers=headers, json=payload)
#     return response.json()

# def get_ner_from_transformer(output):
#     data = output
#     named_entities = {}
#     for entity in data:
#         entity_type = entity['entity_group']
#         entity_text = entity['word']
        
#         if entity_type not in named_entities:
#             named_entities[entity_type] = []
        
#         named_entities[entity_type].append(entity_text)
    
#     # for entity_type, entities in named_entities.items():
#         # print(f"{entity_type}: {', '.join(entities)}")
#     return entity_type,named_entities
    
        

        
#     ###  DRAWING DETECTION FUNCTION  ###
# def drawing_detection(image):
#     # Draw bounding boxes around the detected text regions
#     for detection in image:
#         # Extract the bounding box coordinates
#         points = detection[0]  # List of points defining the bounding box
#         x1, y1 = int(points[0][0]), int(points[0][1])  # Top-left corner
#         x2, y2 = int(points[2][0]), int(points[2][1])  # Bottom-right corner
        
#         # Draw the bounding box
#         cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
    
#         # Add the detected text
#         text = detection[1]
#         cv2.putText(image, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
#         st.image(image,caption='Detected text on the card ',width=710)
#     return image



# st.title("_Business_ card data extractor using opencv and streamlit :sunglasses:")
# res2,res=image_upload_and_ocr(reader)
# darwing_image=drawing_detection(res2)

 
# output = query({
#     "inputs": res,
# })

# entity_type,named_entities= get_ner_from_transformer(output)
# st.write(entity_type)
# st.write(named_entities)




import easyocr
import cv2
import requests
import re
from PIL import Image
import streamlit as st
import numpy as np

# Load the EasyOCR reader for English text. This runs at module level, i.e.
# every time the script is executed.
# NOTE(review): Streamlit presumably re-executes the script on each widget
# interaction, which would rebuild the reader every time — consider caching
# it; confirm against the deployed Streamlit version.
reader = easyocr.Reader(['en'])

# Hugging Face Inference API endpoint for the flair/ner-english-large model.
API_URL = "https://api-inference.huggingface.co/models/flair/ner-english-large"
# The secret is sent verbatim as the Authorization header value.
# NOTE(review): presumably the stored secret already carries the "Bearer "
# prefix the API expects — confirm in the Space's secret settings.
headers = {"Authorization": st.secrets["api_key"]}

## Image uploading function ##
def image_upload_and_ocr(reader, uploaded_file):
    """Run OCR on an uploaded business-card image.

    Args:
        reader: OCR reader exposing ``readtext(ndarray)``. Each detection it
            returns is indexed here as ``item[1]`` for the recognised text.
        uploaded_file: File-like object accepted by ``PIL.Image.open``, or
            ``None`` when nothing has been uploaded.

    Returns:
        A ``(detections, joined_text, image)`` tuple: the raw ``readtext``
        result, the detected texts joined with single spaces, and the
        resized RGB PIL image. ``(None, None, None)`` when ``uploaded_file``
        is ``None``.
    """
    if uploaded_file is None:
        return None, None, None

    # Normalise to 3-channel RGB. Grayscale or palette uploads would
    # otherwise become 2-D arrays, which the RGB->BGR colour conversion used
    # when drawing the bounding boxes cannot handle.
    image = Image.open(uploaded_file).convert("RGB")
    # Fixed working size; note this does not preserve the aspect ratio.
    image = image.resize((640, 480))

    detections = reader.readtext(np.array(image))
    joined_text = ' '.join(item[1] for item in detections)
    return detections, joined_text, image

def query(payload, timeout=30):
    """POST ``payload`` as JSON to the inference endpoint and decode the reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            app if the endpoint stalls.

    Returns:
        The decoded JSON body of the response, whatever its HTTP status.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

def get_ner_from_transformer(output):
    """Group NER results by entity type.

    Args:
        output: Iterable of mappings, each with an ``'entity_group'`` and a
            ``'word'`` key.

    Returns:
        A ``(last_entity_type, named_entities)`` tuple. ``named_entities``
        maps each entity group to the list of its words in input order.
        ``last_entity_type`` is the group of the final item, or ``None``
        when ``output`` is empty.

    Raises:
        ValueError: If ``output`` is a dict rather than a sequence of
            entities (e.g. an error payload), instead of failing later with
            an opaque ``TypeError`` while iterating its keys.
    """
    if isinstance(output, dict):
        raise ValueError(f"Unexpected NER response: {output!r}")

    # Pre-bind so an empty response returns None instead of raising
    # UnboundLocalError at the return statement.
    entity_type = None
    named_entities = {}
    for entity in output:
        entity_type = entity['entity_group']
        named_entities.setdefault(entity_type, []).append(entity['word'])

    return entity_type, named_entities

def drawing_detection(res2, image):
    """Draw OCR detections on the image and display the result in Streamlit.

    Args:
        res2: Iterable of detections. ``detection[0]`` is the list of box
            corner points (index 0 is used as top-left, index 2 as
            bottom-right) and ``detection[1]`` is the recognised text.
        image: RGB image convertible with ``np.array`` (e.g. a PIL image).

    Returns:
        The annotated image as a BGR array.
    """
    cv2_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    # Draw bounding boxes around the detected text regions
    for detection in res2:
        # Extract the bounding box coordinates
        points = detection[0]  # List of points defining the bounding box
        x1, y1 = int(points[0][0]), int(points[0][1])  # Top-left corner
        x2, y2 = int(points[2][0]), int(points[2][1])  # Bottom-right corner

        # Draw the bounding box (colour tuple is in BGR order)
        cv2.rectangle(cv2_image, (x1, y1), (x2, y2), (255, 0, 0), 1)

        # Add the detected text just above the box
        text = detection[1]
        cv2.putText(cv2_image, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    # The array is BGR after the conversion above; st.image assumes RGB
    # unless told otherwise, which would swap red and blue on the card.
    st.image(cv2_image, caption='Detected text on the card', width=710, channels="BGR")
    return cv2_image

# Function to extract phone numbers from text using regular expression
def extract_phone_numbers(text):
    """Return every phone-number-like match in ``text``, or ``None`` if none.

    A match is an optional ``ph``/``phone``/``phno`` prefix (case-insensitive)
    followed by at least seven digits, each optionally signed and optionally
    followed by whitespace or parentheses. Matches are returned as found,
    including any prefix and surrounding whitespace the pattern consumed.
    """
    phone_regex = re.compile(
        r'(?:ph|phone|phno)?\s*(?:[+-]?\d\s*[\(\)]*){7,}',
        re.IGNORECASE,
    )
    matches = phone_regex.findall(text)
    if not matches:
        return None
    return matches

# Function to extract email addresses from text using regular expression
def extract_email(text):
    """Return the e-mail-like strings found in ``text``, or ``None`` if none.

    The pattern tolerates whitespace around ``@`` and around the dot before
    the 2-3 letter suffix. Each match is stripped of surrounding whitespace.
    """
    email_regex = r'[a-z0-9_.-]+(?:\s*@\s*)[a-z]+(?:\s*\.?\s*[a-z]{2,3})\s*'
    found = [hit.strip() for hit in re.findall(email_regex, text, re.IGNORECASE)]
    return found if found else None

# Function to extract designations from text using regular expression
def extract_designation(text):
    """Return the job titles found in ``text``, or ``None`` if none.

    Titles are matched case-insensitively as whole words against a fixed
    list and are returned as they appear in ``text``.
    """
    title_matcher = re.compile(
        r'\b(?:CEO|CFO|CTO|COO|CMO|CIO|President|Vice\s?President|Director|Manager|Executive\s?Director|Assistant\s?Manager|Account\s?Manager|Sales\s?Manager|Marketing\s?Manager|Product\s?Manager|Project\s?Manager|HR\s?Manager|Human\s?Resources\s?Manager|Operations\s?Manager|Business\s?Development\s?Manager|Senior\s?Manager|General\s?Manager|Team\s?Lead|Consultant|Analyst|Engineer|Architect|Designer|Developer|Programmer|Coordinator|Specialist|Supervisor|Administrator|Assistant|Associate|Partner|Founder|Owner|Principal|Expert|Technician|Officer|Representative|Agent|Accountant|Auditor|Trainer|Coach|Educator|Professor|Instructor|Researcher|Scientist|Doctor|Nurse|Therapist|Pharmacist|Attorney|Lawyer|Legal\s?Counsel|Paralegal|Advocate|Solicitor|Notary|Financial\s?Advisor|Investment\s?Advisor|Wealth\s?Manager|Broker|Realtor|Mortgage\s?Broker|Insurance\s?Agent)\b',
        flags=re.IGNORECASE,
    )
    hits = title_matcher.findall(text)
    return hits if hits else None

# Function to extract website URLs from text using regular expression
def extract_websites(text):
    """Return the website-like strings found in ``text``, or ``None`` if none.

    A match is an optional ``http://``/``https://`` scheme, an optional
    ``www.`` and a word followed by one or more ``.word`` parts. Any dotted
    token therefore matches, including the domain part of an e-mail address.

    The groups are non-capturing so that ``re.findall`` returns each full
    match. With capturing groups only the last ``.word`` repetition was
    kept, turning ``www.example.co.in`` into ``www.example.in``.
    """
    pattern = r'(?:https?://)?(?:www\.)?\w+(?:\.\w+)+'
    websites = re.findall(pattern, text)
    return websites or None

# Function to extract PIN codes from text using regular expression
def extract_pin_code(text):
    """Return the first standalone six-digit number in ``text``, or ``None``.

    The search runs on the lower-cased text and requires word boundaries on
    both sides, so six digits inside a longer digit run do not match.
    """
    found = re.search(r'\b\d{6}\b', text.lower())
    return found.group() if found else None

import pandas as pd

def _select_or_enter(label, candidates, manual_when_empty=False):
    """Let the user pick one of ``candidates`` or type a value by hand.

    Renders a select box with a leading blank option when there are
    candidates; if the blank option stays selected, a text input is shown
    instead. With no candidates, the text input is shown only when
    ``manual_when_empty`` is true.

    Returns the chosen or typed value, or ``None`` when nothing was provided.
    """
    if candidates:
        choice = st.selectbox(f"Select {label}:", [""] + list(candidates))
        if choice:
            return choice
    elif not manual_when_empty:
        return None
    return st.text_input(f"Enter {label} manually:") or None


# Streamlit UI
st.title("Business Card Data Extractor using OpenCV and Streamlit")

uploaded_file = st.file_uploader(label="Please upload a business card", type=['jpeg', 'jpg', 'png', 'webp'], accept_multiple_files=False)

if uploaded_file is not None:
    res2, res, image = image_upload_and_ocr(reader, uploaded_file)

    if res2 is not None:
        drawing_image = drawing_detection(res2, image)

        # Default to "no entities" so a failed NER call is reported below
        # without leaving named_entities undefined (previously a NameError).
        named_entities = {}
        try:
            output = query({
                "inputs": res,
            })

            entity_type, named_entities = get_ner_from_transformer(output)
        except Exception as e:
            st.error("An error occurred while processing the business card. Please try again later.")
            st.error(f"Error details: {str(e)}")

        # All LOC entities are offered as one comma-separated option.
        locations = named_entities.get("LOC", [])
        pin_code = extract_pin_code(res)

        # (output column, widget label, candidates, offer manual entry when
        # nothing was detected). Person and company names come from NER,
        # the rest from the regex extractors.
        fields = [
            ("Name", "Person's Name", named_entities.get("PER", []), False),
            ("Designation", "Designation", extract_designation(res), False),
            ("Company Name", "Company Name", named_entities.get("ORG", []), False),
            ("Email", "Email", extract_email(res), False),
            ("Website", "Website", extract_websites(res), False),
            ("Phone Number", "Phone Number", extract_phone_numbers(res), False),
            ("Location", "Location", [", ".join(locations)] if locations else [], True),
            ("PIN Code", "PIN Code", [pin_code] if pin_code is not None else [], False),
        ]

        extracted_data = {}
        for column, label, candidates, manual_when_empty in fields:
            value = _select_or_enter(label, candidates, manual_when_empty)
            if value:
                extracted_data[column] = value

        # Display extracted data
        if extracted_data:
            st.write("Extracted Data:")
            df = pd.DataFrame([extracted_data], columns=["Name", "Designation", "Company Name", "Email", "Website", "Phone Number", "Location", "PIN Code"])
            st.write(df)