File size: 13,694 Bytes
582a7fc
 
 
854ac8d
582a7fc
 
854ac8d
582a7fc
 
 
854ac8d
582a7fc
 
854ac8d
582a7fc
 
854ac8d
582a7fc
 
 
 
 
 
 
 
351c297
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
582a7fc
 
 
 
 
74fc7fc
351c297
8a46e51
 
 
 
74fc7fc
2085f26
582a7fc
 
351c297
582a7fc
351c297
 
 
 
 
8a46e51
351c297
 
 
 
 
582a7fc
74fc7fc
 
 
 
002ac57
 
 
 
 
 
 
 
 
 
 
 
351c297
002ac57
351c297
 
582a7fc
351c297
582a7fc
 
 
 
 
 
351c297
582a7fc
 
 
351c297
 
 
 
582a7fc
351c297
 
 
 
8a46e51
351c297
 
 
 
582a7fc
351c297
 
 
 
 
 
 
 
 
 
 
582a7fc
351c297
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74fc7fc
351c297
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
# from flair.data import Sentence
# from flair.models import SequenceTagger
# import streamlit as st

# # load tagger
# tagger = SequenceTagger.load("flair/ner-english-large")

# # make example sentence
# text=st.text_area("Enter the text to detect it's named entities")
# sentence = Sentence(text)

# # predict NER tags
# tagger.predict(sentence)

# # print sentence
# print(sentence)

# # print predicted NER spans
# print('The following NER tags are found:')
# # iterate over entities and print
# for entity in sentence.get_spans('ner'):
#     print(entity)



# import easyocr
# import cv2
# import requests
# import re
# from PIL import Image
# import streamlit as st
# # import os


# # Load the EasyOCR reader
# reader = easyocr.Reader(['en'])


# # key=os.environ.getattribute("api_key")
# # print(key)
# API_URL = "https://api-inference.huggingface.co/models/flair/ner-english-large"
# headers = {"Authorization": st.secrets["api_key"]}

# ## Image uploading function ##
# def image_upload_and_ocr(reader):
#     uploaded_file=st.file_uploader(label=':red[**please upload a busines card** :sunglasses:]',type=['jpeg','jpg','png','webp'])
#     if uploaded_file is not None:
#         image=Image.open(uploaded_file)
#         image=image.resize((640,480))
#         result2 = reader.readtext(image)
#         # result2=result
#         texts = [item[1] for item in result2]
#         result=' '.join(texts)
#     return result2,result


# def query(payload):
#     response = requests.post(API_URL, headers=headers, json=payload)
#     return response.json()

# def get_ner_from_transformer(output):
#     data = output
#     named_entities = {}
#     for entity in data:
#         entity_type = entity['entity_group']
#         entity_text = entity['word']
        
#         if entity_type not in named_entities:
#             named_entities[entity_type] = []
        
#         named_entities[entity_type].append(entity_text)
    
#     # for entity_type, entities in named_entities.items():
#         # print(f"{entity_type}: {', '.join(entities)}")
#     return entity_type,named_entities
    
        

        
#     ###  DRAWING DETECTION FUNCTION  ###
# def drawing_detection(image):
#     # Draw bounding boxes around the detected text regions
#     for detection in image:
#         # Extract the bounding box coordinates
#         points = detection[0]  # List of points defining the bounding box
#         x1, y1 = int(points[0][0]), int(points[0][1])  # Top-left corner
#         x2, y2 = int(points[2][0]), int(points[2][1])  # Bottom-right corner
        
#         # Draw the bounding box
#         cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
    
#         # Add the detected text
#         text = detection[1]
#         cv2.putText(image, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
#         st.image(image,caption='Detected text on the card ',width=710)
#     return image



# st.title("_Business_ card data extractor using opencv and streamlit :sunglasses:")
# res2,res=image_upload_and_ocr(reader)
# darwing_image=drawing_detection(res2)

 
# output = query({
#     "inputs": res,
# })

# entity_type,named_entities= get_ner_from_transformer(output)
# st.write(entity_type)
# st.write(named_entities)




import easyocr
import cv2
import requests
import re
from PIL import Image
import streamlit as st
import numpy as np

# Load the EasyOCR reader for English text. This is a module-level statement,
# so it runs each time the script body executes.
# NOTE(review): Streamlit presumably re-executes the script on every widget
# interaction, which would rebuild the reader each time — consider caching.
reader = easyocr.Reader(['en'])

# Hugging Face inference endpoint for the flair/ner-english-large model.
API_URL = "https://api-inference.huggingface.co/models/flair/ner-english-large"
# The "api_key" secret is sent verbatim as the Authorization header value.
# NOTE(review): presumably the secret already includes the "Bearer " prefix —
# confirm against the deployed secrets.
headers = {"Authorization": st.secrets["api_key"]}

## Image uploading function ##
def image_upload_and_ocr(reader, uploaded_file):
    """Run OCR on an uploaded business-card image.

    Args:
        reader: OCR engine exposing ``readtext(ndarray)``; the script passes
            an ``easyocr.Reader``.
        uploaded_file: File-like object accepted by ``PIL.Image.open``, or
            ``None`` when nothing has been uploaded.

    Returns:
        A ``(detections, text, image)`` tuple: the raw ``readtext`` result,
        the detected strings (element 1 of every detection) joined with single
        spaces, and the resized RGB PIL image. ``(None, None, None)`` when
        ``uploaded_file`` is ``None``.
    """
    if uploaded_file is None:
        return None, None, None

    # Normalise to 3-channel RGB: grayscale and palette uploads would
    # otherwise become 2-D arrays, which the later RGB->BGR conversion in the
    # drawing step cannot handle (and palette indices are not pixel values).
    image = Image.open(uploaded_file).convert("RGB")
    # Fixed working size; the aspect ratio of the upload is not preserved.
    image = image.resize((640, 480))

    detections = reader.readtext(np.array(image))
    text = ' '.join(item[1] for item in detections)

    return detections, text, image

def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serialisable request body; the script sends
            ``{"inputs": <ocr text>}``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    # A timeout keeps the app from hanging indefinitely on a stalled request.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
    # Surface HTTP errors here instead of passing an error payload on as if
    # it were a list of predictions.
    response.raise_for_status()
    return response.json()

def get_ner_from_transformer(output):
    """Group NER predictions by entity type.

    Args:
        output: Iterable of prediction dicts, each holding an
            ``'entity_group'`` label and the matched ``'word'``.

    Returns:
        A ``(entity_type, named_entities)`` tuple. ``entity_type`` is the
        label of the last prediction seen (``None`` when there are no
        predictions); ``named_entities`` maps each label to the list of words
        found for it, in input order.

    Raises:
        ValueError: If ``output`` is a dict rather than a sequence of
            predictions (e.g. an error payload).
        KeyError: If a prediction lacks ``'entity_group'`` or ``'word'``.
    """
    if isinstance(output, dict):
        raise ValueError(f"Unexpected NER response: {output!r}")

    # Bound up front so an empty prediction list does not hit an unbound name
    # at the return statement.
    entity_type = None
    named_entities = {}
    for entity in output:
        entity_type = entity['entity_group']
        entity_text = entity['word']
        named_entities.setdefault(entity_type, []).append(entity_text)

    return entity_type, named_entities

def drawing_detection(res2, image):
    """Draw the OCR detections on the card image and show it in the app.

    Args:
        res2: Iterable of detections; element 0 of each is a list of corner
            points (index 0 is used as top-left, index 2 as bottom-right) and
            element 1 is the detected text.
        image: RGB image convertible with ``np.array`` (the script passes a
            PIL image).

    Returns:
        The annotated image as a BGR NumPy array.
    """
    cv2_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    for detection in res2:
        points = detection[0]
        x1, y1 = int(points[0][0]), int(points[0][1])  # Top-left corner
        x2, y2 = int(points[2][0]), int(points[2][1])  # Bottom-right corner

        # Colours are given in BGR order: blue boxes, red labels.
        cv2.rectangle(cv2_image, (x1, y1), (x2, y2), (255, 0, 0), 1)

        text = detection[1]
        cv2.putText(cv2_image, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    # The array is BGR; without channels="BGR" st.image assumes RGB and the
    # red and blue channels come out swapped.
    st.image(cv2_image, caption='Detected text on the card', channels="BGR", width=710)
    return cv2_image

# Function to extract phone numbers from text using regular expression
def extract_phone_numbers(text):
    """Find phone-number-like digit runs in ``text``.

    A match is an optional ``ph`` / ``phone`` / ``phno`` label (any case)
    followed by at least seven digits, each optionally preceded by ``+`` or
    ``-`` and followed by whitespace or parentheses. A matched label is kept
    as part of the returned string.

    Args:
        text: Text to search.

    Returns:
        List of matches with surrounding whitespace removed, or ``None`` when
        nothing matches.
    """
    PHONE_PATTERN = r'(?:ph|phone|phno)?\s*(?:[+-]?\d\s*[\(\)]*){7,}'

    matches = re.findall(PHONE_PATTERN, text, re.IGNORECASE)
    # The pattern swallows whitespace on both sides of the digits; trim it so
    # callers get clean values, as extract_email already does.
    phone_numbers = [match.strip() for match in matches]
    return phone_numbers or None

# Function to extract email addresses from text using regular expression
def extract_email(text):
    """Find e-mail-like strings in ``text``.

    The pattern is lenient: it allows whitespace around the ``@`` and around
    the final dot, and the dot itself is optional. Matching is
    case-insensitive.

    Args:
        text: Text to search.

    Returns:
        List of matches with surrounding whitespace stripped, or ``None`` when
        nothing matches.
    """
    email_pattern = r'[a-z0-9_.-]+(?:\s*@\s*)[a-z]+(?:\s*\.?\s*[a-z]{2,3})\s*'
    found = [
        candidate.strip()
        for candidate in re.findall(email_pattern, text, re.IGNORECASE)
    ]
    if not found:
        return None
    return found

# Function to extract designations from text using regular expression
def extract_designation(text):
    """Find job titles from a fixed list in ``text``.

    Titles are matched case-insensitively on word boundaries and returned as
    they are written in ``text``.

    Args:
        text: Text to search.

    Returns:
        List of matched titles in order of appearance, or ``None`` when
        nothing matches.
    """
    title_matcher = re.compile(
        r'\b(?:CEO|CFO|CTO|COO|CMO|CIO|President|Vice\s?President|Director|Manager|Executive\s?Director|Assistant\s?Manager|Account\s?Manager|Sales\s?Manager|Marketing\s?Manager|Product\s?Manager|Project\s?Manager|HR\s?Manager|Human\s?Resources\s?Manager|Operations\s?Manager|Business\s?Development\s?Manager|Senior\s?Manager|General\s?Manager|Team\s?Lead|Consultant|Analyst|Engineer|Architect|Designer|Developer|Programmer|Coordinator|Specialist|Supervisor|Administrator|Assistant|Associate|Partner|Founder|Owner|Principal|Expert|Technician|Officer|Representative|Agent|Accountant|Auditor|Trainer|Coach|Educator|Professor|Instructor|Researcher|Scientist|Doctor|Nurse|Therapist|Pharmacist|Attorney|Lawyer|Legal\s?Counsel|Paralegal|Advocate|Solicitor|Notary|Financial\s?Advisor|Investment\s?Advisor|Wealth\s?Manager|Broker|Realtor|Mortgage\s?Broker|Insurance\s?Agent)\b',
        re.IGNORECASE,
    )
    titles = title_matcher.findall(text)
    return titles if titles else None

# Function to extract website URLs from text using regular expression
def extract_websites(text):
    """Find website-like strings in ``text``.

    A match is an optional ``http://`` / ``https://`` scheme, an optional
    ``www.`` prefix, and a word followed by one or more ``.word`` parts.

    Args:
        text: Text to search.

    Returns:
        List of the complete matched strings, or ``None`` when nothing
        matches.
    """
    # Groups are non-capturing so findall returns whole matches. With
    # capturing groups a repeated group keeps only its last repetition, which
    # dropped middle labels ("www.example.co.in" came back as "www.example.in").
    pattern = r'(?:https?://)?(?:www\.)?\w+(?:\.\w+)+'
    websites = re.findall(pattern, text)
    return websites or None

# Function to extract PIN codes from text using regular expression
def extract_pin_code(text):
    """Return the first standalone six-digit number in ``text``.

    Args:
        text: Text to search; it is lower-cased before matching.

    Returns:
        The six-digit string, or ``None`` when no run of exactly six digits
        delimited by word boundaries is present.
    """
    found = re.search(r'\b\d{6}\b', text.lower())
    return found.group() if found else None

import pandas as pd

# Streamlit UI


def _select_or_enter(field_label, options, manual_when_empty=False):
    """Let the user pick a detected value for a field or type one by hand.

    Args:
        field_label: Human-readable field name used in the widget labels
            ("Select <label>:" and "Enter <label> manually:").
        options: Detected candidate values; may be empty or ``None``.
        manual_when_empty: When true, still offer the manual text input if
            there are no candidates.

    Returns:
        The chosen or typed value, or ``None`` when nothing was provided.
    """
    if options:
        chosen = st.selectbox(f"Select {field_label}:", [""] + list(options))
        if chosen:
            return chosen
    elif not manual_when_empty:
        return None
    # Reached when the blank option is selected, or when there are no
    # candidates and manual entry was requested.
    return st.text_input(f"Enter {field_label} manually:") or None


st.title("Business Card Data Extractor using OpenCV and Streamlit")

uploaded_file = st.file_uploader(
    label="Please upload a business card",
    type=['jpeg', 'jpg', 'png', 'webp'],
    accept_multiple_files=False,
)

if uploaded_file is not None:
    res2, res, image = image_upload_and_ocr(reader, uploaded_file)

    if res2 is not None:
        drawing_image = drawing_detection(res2, image)

        # Start from an empty NER result so a failed request degrades to the
        # regex-based fields instead of raising NameError further down.
        entity_type, named_entities = None, {}
        try:
            output = query({
                "inputs": res,
            })

            entity_type, named_entities = get_ner_from_transformer(output)
        except Exception as e:
            # Top-level UI boundary: report the failure and carry on.
            st.error("An error occurred while processing the business card. Please try again later.")
            st.error(f"Error details: {str(e)}")

        # All location entities are offered as one comma-separated choice.
        locations = named_entities.get("LOC", [])
        pin_code = extract_pin_code(res)

        # (output column, widget label, candidates, offer manual entry when
        # there are no candidates) — listed in the order the widgets appear.
        fields = [
            ("Name", "Person's Name", named_entities.get("PER", []), False),
            ("Designation", "Designation", extract_designation(res), False),
            ("Company Name", "Company Name", named_entities.get("ORG", []), False),
            ("Email", "Email", extract_email(res), False),
            ("Website", "Website", extract_websites(res), False),
            ("Phone Number", "Phone Number", extract_phone_numbers(res), False),
            ("Location", "Location", [", ".join(locations)] if locations else [], True),
            ("PIN Code", "PIN Code", [pin_code] if pin_code is not None else [], False),
        ]

        extracted_data = {}
        for column, field_label, options, manual_when_empty in fields:
            value = _select_or_enter(field_label, options, manual_when_empty)
            if value:
                extracted_data[column] = value

        # Display extracted data
        if extracted_data:
            st.write("Extracted Data:")
            df = pd.DataFrame([extracted_data], columns=["Name", "Designation", "Company Name", "Email", "Website", "Phone Number", "Location", "PIN Code"])
            st.write(df)