|
import re |
|
from PyPDF2 import PdfReader |
|
|
|
# Ordered cleanup passes used by preprocess_text(): (compiled pattern, replacement).
# The order is significant: line breaks are deleted first, which is what lets the
# "Issued ... Credential ID" pattern match text that originally spanned two lines.
_CLEANUP_PASSES = (
    # Delete newlines and tabs outright (no space is inserted in their place).
    (re.compile(r'\n|\t'), ''),
    # Drop a lone capital letter standing between whitespace. The trailing
    # whitespace is only looked at, not consumed, so runs such as " A B "
    # are removed completely instead of every other letter.
    (re.compile(r'\s[A-Z](?=\s)'), ''),
    # Anything shaped like an e-mail address.
    (re.compile(r'\S+@\S+'), ''),
    # Dates written as NN-NN-NNNN or NN/NN/NNNN.
    (re.compile(r'\d{2}[-/]\d{2}[-/]\d{4}'), ''),
    # Phone numbers with a two-digit "+NN" prefix.
    (re.compile(r'\+\d{2}\s?\d{2,3}\s?\d{3,4}\s?\d{4}'), ''),
    # Certificate boilerplate: "Issued <word> <year>Credential ID <id>".
    (re.compile(r'Issued\s\w+\s\d{4}Credential ID \w+'), ''),
    # Collapse every whitespace run into a single space.
    (re.compile(r'\s+'), ' '),
    # Re-split words that were glued together ("fooBar" -> "foo Bar").
    (re.compile(r'(?<=[a-z])(?=[A-Z])'), ' '),
)


def preprocess_text(text):
    """Strip noise from text extracted from a PDF and normalise its spacing.

    The passes in ``_CLEANUP_PASSES`` are applied in order: newlines/tabs are
    deleted, isolated single capital letters are dropped (this includes the
    words "A" and "I"), e-mail addresses, ``NN-NN-NNNN`` style dates,
    ``+NN``-prefixed phone numbers and "Issued ... Credential ID ..." lines
    are removed, whitespace runs collapse to one space, and finally a space
    is inserted at every lowercase-to-uppercase boundary.

    Args:
        text: Raw text to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string; ``""`` when *text* is empty or ``None``.
        Leading/trailing spaces left over by the passes are not stripped.
    """
    if not text:
        return ""

    for pattern, replacement in _CLEANUP_PASSES:
        text = pattern.sub(replacement, text)
    return text
|
|
|
def _read_pdf_text(pdf):
    """Return the text of every page of *pdf*, concatenated in page order."""
    reader = PdfReader(pdf)
    # ``or ""`` is defensive: a falsy extraction result is treated as empty
    # text so the join cannot fail on a page that yields nothing.
    return "".join(page.extract_text() or "" for page in reader.pages)


def get_pdf_text(pdfs, preprocess=True):
    """Extract text from a collection of PDFs.

    Args:
        pdfs: Iterable of objects accepted by ``PdfReader``. ``None`` is
            treated as an empty collection.
        preprocess: Selects the return shape (see below).

    Returns:
        When *preprocess* is true: a list with one
        ``{"filename": ..., "text": ...}`` dict per PDF, where ``text`` has
        been passed through ``preprocess_text`` and ``filename`` is the
        object's ``name`` attribute (falling back to ``str(pdf)`` when the
        object has none).

        When *preprocess* is false: a single string holding the raw text of
        all PDFs concatenated with no separator.
    """
    documents = pdfs or ()

    if not preprocess:
        return "".join(_read_pdf_text(pdf) for pdf in documents)

    return [
        {
            "filename": getattr(pdf, "name", str(pdf)),
            "text": preprocess_text(_read_pdf_text(pdf)),
        }
        for pdf in documents
    ]