# PDF resume loading and text-cleaning utilities.
from langchain_community.document_loaders import PyMuPDFLoader
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string
def load_pdf(file_path):
    """Load the PDF at *file_path* and return its pages as LangChain documents."""
    return PyMuPDFLoader(file_path).load()
def clean_text(text):
    """Normalize raw PDF text for downstream keyword matching.

    Pipeline: strip bullet glyphs, remove punctuation and digits, collapse
    whitespace, lowercase, drop English stopwords, then lemmatize each token.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        A single lowercase string of space-separated, lemmatized tokens.
    """
    # Strip bullet glyphs commonly produced by PDF extraction.
    bullets = "○●•◦"
    text = re.sub(f"[{re.escape(bullets)}]", "", text)
    # Remove punctuation in one C-level pass.
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove digits.
    text = re.sub(r"\d+", "", text)
    # Collapse whitespace runs, then lowercase — the NLTK stopword list is
    # lowercase, so this must happen before the stopword filter below.
    text = " ".join(text.split())
    text = text.lower()
    # Drop English stopwords; build the set once per call for O(1) membership.
    stop_words = set(stopwords.words("english"))
    words = [word for word in text.split() if word not in stop_words]
    # Lemmatize (WordNet noun-default) rather than stem, keeping tokens
    # readable. (Dead commented-out PorterStemmer code removed.)
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in words)
def get_full_resume_text(file_path):
    """Load the PDF at *file_path*, concatenate every page, and clean the text.

    Args:
        file_path: Path to the resume PDF on disk.

    Returns:
        The cleaned, concatenated text of all pages (see ``clean_text``).
    """
    resume_pages = load_pdf(file_path)
    # Build the full text with one C-speed join instead of quadratic
    # string += in a loop. The trailing "\n\n" the original loop produced
    # is preserved (clean_text collapses whitespace anyway).
    full_text = "".join(page.page_content + "\n\n" for page in resume_pages)
    return clean_text(full_text)
def process_pdf(file):
    """Entry point for an uploaded-file object (e.g. from Gradio).

    Args:
        file: An uploaded-file object exposing a ``.name`` path attribute.

    Returns:
        The cleaned resume text extracted from the uploaded PDF.
    """
    path = file.name
    return get_full_resume_text(path)