"""Extract e-mails from an Outlook PST archive and export cleaned text to CSV.

The script walks every folder of the PST file at ``DATA_PATH``, collects
subject / body / sender / delivery time of each message, strips noise from
the body using the regular expressions in ``patterns`` and writes the result
to a semicolon-separated CSV file.
"""

import re
from datetime import datetime

import pandas as pd
import pypff

print(f"[INFO] pypff version: {pypff.get_version()}")

# Raw string: "\A" is not a valid escape sequence, so the non-raw literal
# only worked by accident (and warns on recent Python versions).
DATA_PATH = r"data\Archiv_2023_2.pst"

# Patterns to match and remove from message bodies. They are applied in
# order by format_item(), each match being replaced by the empty string.
# NOTE(review): the two disclaimer patterns use ".*" without re.DOTALL, so
# they only match when the whole disclaimer sits on a single line -- confirm
# that this is intended.
patterns = [
    r"\bM\s\+\d{3}\d{3}\d{3}\d{3}\b",  # Phone numbers ("M +" followed by 12 digits)
    r"\bP\s\+\d{3}\d{3}\d{3}\d{3}\b",  # Phone numbers ("P +" followed by 12 digits)
    r"\S+@\S+",  # Email addresses
    r"http[s]?://\S+",  # URLs
    r"preciosalighting\.com\s*<",
    r"Facebook\s*<",  # Social Media links
    r"Instagram\s*<",
    r"Youtube\s*<",
    r"Pinterest\s*<",
    r"Linkedin\s*<",
    r"_+",  # Line of underscores
    # Czech legal disclaimer
    r"Tento e-mail je určen pouze.*od odesílatele k adresátovi\.",
    # English legal disclaimer
    r"This e-mail transmission is intended solely.*from the sender to the recipient\.",
    # Text from a "From:", "Sent:", ... marker up to the end of that line
    r"From:.*\n?",
    r"Sent:.*\n?",
    r"To:.*\n?",
    r"Cc:.*\n?",
    r"Subject:.*\n?",
    r";",  # Semicolons
    # Anything that is not a word character, whitespace, comma or period
    r"[^\w\s,.]",
]


def extract_emails(pst_file):
    """Collect all messages stored in a PST file.

    Args:
        pst_file: Path of the PST archive to open with ``pypff``.

    Returns:
        A list of dicts with the keys ``"subject"``, ``"body"``,
        ``"sender"`` and ``"date"``, holding the message's ``subject``,
        ``plain_text_body``, ``sender_name`` and ``delivery_time``
        attributes respectively. Messages of sub-folders come before the
        messages of the folder that contains them.
    """
    opened_pst = pypff.open(pst_file)
    emails = []

    def process_folder(folder):
        # Use a distinct loop variable: reusing the name ``folder`` here
        # would rebind the parameter, so the message loop below would read
        # the last sub-folder (again) instead of the current folder.
        for sub_folder in folder.sub_folders:
            process_folder(sub_folder)
        for message in folder.sub_messages:
            emails.append(
                {
                    "subject": message.subject,
                    "body": message.plain_text_body,
                    "sender": message.sender_name,
                    "date": message.delivery_time,
                }
            )

    try:
        process_folder(opened_pst.get_root_folder())
    finally:
        # Release the archive handle even if traversal fails.
        opened_pst.close()
    return emails


def format_item(item, patterns):
    """Turn a raw message dict into a cleaned, CSV-ready record.

    Args:
        item: Dict with the keys ``"subject"``, ``"body"``, ``"sender"`` and
            ``"date"`` as produced by ``extract_emails``. ``"body"`` may be
            bytes, str or ``None``; ``"date"`` may be ``None`` or any object
            with a ``strftime`` method.
        patterns: Iterable of regular expressions; every match is removed
            from the body, in the given order.

    Returns:
        A dict with the same four keys. ``"body"`` is the cleaned text with
        all whitespace runs collapsed to a single space, ``"date"`` is a
        ``YYYY-MM-DD`` string, or ``None`` when the message has no date.
    """
    delivered = item["date"]
    date = delivered.strftime("%Y-%m-%d") if delivered is not None else None

    raw_body = item["body"]
    if raw_body is None:
        # Message without a plain-text body.
        body = ""
    elif isinstance(raw_body, (bytes, bytearray)):
        # A single undecodable byte must not abort the whole export; the
        # replacement characters are stripped by the final pattern anyway.
        body = raw_body.decode("utf-8", errors="replace")
    else:
        body = raw_body

    for pattern in patterns:
        body = re.sub(pattern, "", body)
    body = re.sub(r"\s+", " ", body).strip()

    return {
        "subject": item["subject"],
        "body": body,
        "sender": item["sender"],
        "date": date,
    }


def main():
    """Export every message of ``DATA_PATH`` to ``data\\emails.csv``."""
    emails = extract_emails(DATA_PATH)
    dataset_list = [format_item(email, patterns) for email in emails]
    df = pd.DataFrame(dataset_list)
    # Raw string for the same reason as DATA_PATH ("\e" is not an escape).
    df.to_csv(r"data\emails.csv", index=True, header=True, sep=";")


if __name__ == "__main__":
    main()