Spaces:
Sleeping
Sleeping
File size: 1,088 Bytes
5207833 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
import spacy
from transformers import pipeline
import re
from dateutil.parser import parse
# Regex pattern for dates such as "January 5th, 2023" or "Feb 3, 2021".
# Compiled once at import time so repeated calls do not re-resolve it.
_DATE_PATTERN = re.compile(
    r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?'
    r'|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?'
    r'|Dec(?:ember)?)\s+\d{1,2}(?:th|st|nd|rd)?,\s+\d{4}\b'
)


def extract_entities(email_text: str, nlp, ner_pipeline, *, min_score=0.75) -> dict:
    """Extract named entities and calendar dates from an email body.

    Args:
        email_text: Raw text to analyse.
        nlp: Callable invoked as ``nlp(email_text)``; the result must expose
            ``.ents``, an iterable of objects with ``.text`` and ``.label_``
            (e.g. a loaded spaCy pipeline).
        ner_pipeline: Callable invoked as ``ner_pipeline(email_text)``; must
            return an iterable of mappings with ``'word'``, ``'score'`` and
            either ``'entity'`` or ``'entity_group'`` (e.g. a transformers
            token-classification pipeline, with or without aggregation).
        min_score: Transformer entities are kept only when their score is
            strictly greater than this value.

    Returns:
        A dict with three keys:
        ``"spaCy Entities"`` - list of ``{"Text", "Type"}`` dicts;
        ``"Transformer Entities"`` - list of ``{"Text", "Type", "Score"}``
        dicts that passed the score filter;
        ``"Dates"`` - list of ``YYYY-MM-DD`` strings, in order of appearance.
    """
    # Use spaCy for initial extraction
    doc = nlp(email_text)
    spacy_entities = [
        {"Text": ent.text, "Type": ent.label_} for ent in doc.ents
    ]

    # Use transformer model for refined extraction
    transformer_entities = [
        {
            "Text": ent['word'],
            # Aggregated pipelines report the label under 'entity_group'
            # instead of 'entity'; accept both output shapes.
            "Type": ent['entity'] if 'entity' in ent else ent['entity_group'],
            "Score": ent['score'],
        }
        for ent in ner_pipeline(email_text)
        if ent['score'] > min_score
    ]

    # Extract dates using regex
    dates = []
    for candidate in _DATE_PATTERN.findall(email_text):
        try:
            parsed = parse(candidate)
        except (ValueError, OverflowError):
            # The regex matches on shape only, so impossible dates such as
            # "February 30, 2023" reach the parser; skip them rather than
            # discarding every other result by raising.
            continue
        dates.append(parsed.strftime('%Y-%m-%d'))

    return {
        "spaCy Entities": spacy_entities,
        "Transformer Entities": transformer_entities,
        "Dates": dates,
    }
|