File size: 2,228 Bytes
99afe26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pypff
import re
from datetime import datetime
import pandas as pd

# Print the pypff version as soon as the module is loaded; this is a
# module-level statement, so it also runs on import, before main() is called.
print(f"[INFO] pypff version: {pypff.get_version()}")

# Relative path of the PST archive that main() processes.
# Written as a raw string because "\A" is not a valid escape sequence: the
# plain literal only worked through deprecated behaviour. The value itself
# (including the backslash separator) is unchanged.
# NOTE(review): the backslash separator presumably targets Windows - confirm
# before running on another platform.
DATA_PATH = r"data\Archiv_2023_2.pst"

# Regular expressions whose matches are deleted from an e-mail body.
# format_item() applies them one after another in list order, so each
# expression sees the text left over by the previous ones.
# NOTE(review): no flags are passed to re.sub, so ".*" in the two disclaimer
# expressions does not cross line breaks - confirm the disclaimers arrive on
# a single line.
patterns = [
    # Phone numbers: "M" or "P" marker, one whitespace, "+" and twelve digits
    r"\bM\s\+\d{3}\d{3}\d{3}\d{3}\b",
    r"\bP\s\+\d{3}\d{3}\d{3}\d{3}\b",
    # E-mail addresses
    r"\S+@\S+",
    # URLs
    r"http[s]?://\S+",
    # Company site label followed by "<"
    r"preciosalighting\.com\s*<",
    # Social media link labels followed by "<"
    *(
        network + r"\s*<"
        for network in ("Facebook", "Instagram", "Youtube", "Pinterest", "Linkedin")
    ),
    # Runs of underscores (separator lines)
    r"_+",
    # Legal disclaimer, Czech version
    r"Tento e-mail je určen pouze.*od odesílatele k adresátovi\.",
    # Legal disclaimer, English version
    r"This e-mail transmission is intended solely.*from the sender to the recipient\.",
    # Quoted message header lines, up to and including the line break
    *(
        header + r":.*\n?"
        for header in ("From", "Sent", "To", "Cc", "Subject")
    ),
    # Semicolons
    r";",
    # Anything that is not a word character, whitespace, comma or full stop
    r"[^\w\s,.]",
]


def extract_emails(pst_file):
    """Collect every message stored in a PST file.

    The folder tree is walked recursively from the root folder. For each
    folder, its sub-folders are visited first and its own messages are
    appended afterwards.

    Args:
        pst_file: Path of the PST file; handed unchanged to ``pypff.open``.

    Returns:
        A list with one dict per message. Each dict holds the message's
        ``subject``, ``plain_text_body`` (key ``"body"``), ``sender_name``
        (key ``"sender"``) and ``delivery_time`` (key ``"date"``) exactly as
        pypff returned them.
    """
    opened_pst = pypff.open(pst_file)
    emails = []

    def process_folder(folder):
        # The loop variable deliberately has its own name. Reusing ``folder``
        # would rebind the parameter to the last sub-folder, so the message
        # loop below would skip this folder's messages and re-read the last
        # sub-folder's messages instead.
        for sub_folder in folder.sub_folders:
            process_folder(sub_folder)
        for message in folder.sub_messages:
            emails.append(
                {
                    "subject": message.subject,
                    "body": message.plain_text_body,
                    "sender": message.sender_name,
                    "date": message.delivery_time,
                }
            )

    try:
        process_folder(opened_pst.get_root_folder())
    finally:
        # Release the PST file handle even when the walk raises.
        opened_pst.close()
    return emails


def format_item(item, patterns):
    """Turn one raw e-mail record into a cleaned, CSV-ready record.

    The body is decoded to text, every match of every expression in
    ``patterns`` is removed (in list order), and all runs of whitespace are
    collapsed to a single space with leading/trailing whitespace stripped.

    Args:
        item: Dict with the keys ``"subject"``, ``"body"``, ``"sender"`` and
            ``"date"``. ``"body"`` may be bytes, str or ``None``; ``"date"``
            may be an object with ``strftime`` or ``None``.
        patterns: Iterable of regular expressions to delete from the body.

    Returns:
        A new dict with the same four keys: ``subject`` and ``sender`` are
        passed through, ``body`` is the cleaned text (``""`` when the body
        was ``None``) and ``date`` is formatted as ``YYYY-MM-DD`` (``None``
        when the date was ``None``).
    """
    raw_date = item["date"]
    # A missing date is kept as None instead of raising AttributeError.
    date = raw_date.strftime("%Y-%m-%d") if raw_date is not None else None

    body = item["body"]
    if body is None:
        # A missing body is treated as empty text.
        body = ""
    elif isinstance(body, (bytes, bytearray)):
        # Bodies are not guaranteed to be valid UTF-8; undecodable bytes
        # become U+FFFD instead of aborting the whole export.
        body = body.decode("utf-8", errors="replace")

    for pattern in patterns:
        body = re.sub(pattern, "", body)
    # Raw string: "\s" is not a valid escape sequence in a normal literal.
    body = re.sub(r"\s+", " ", body).strip()

    return {
        "subject": item["subject"],
        "body": body,
        "sender": item["sender"],
        "date": date,
    }


def main():
    """Export the cleaned e-mails of the PST archive to a CSV file.

    Reads every message from ``DATA_PATH`` with ``extract_emails``, cleans
    each one with ``format_item`` using the module-level ``patterns``, and
    writes the result to ``data\\emails.csv`` with ``;`` as separator,
    including the index column and the header row.
    """
    dataset_list = [
        format_item(email, patterns) for email in extract_emails(DATA_PATH)
    ]

    df = pd.DataFrame(dataset_list)
    # Raw string: "\e" is not a valid escape sequence in a normal literal.
    # The path written to is unchanged.
    df.to_csv(r"data\emails.csv", index=True, header=True, sep=";")


# Run the export only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()