# RAG-Retrieve-Ingest-cz-eng / support / process_archives.py
# Teapack1's picture
# initial
# 99afe26
import pypff
import re
from datetime import datetime
import pandas as pd
# Report the version of the pypff bindings. This is a module-level statement,
# so it runs whenever the module is imported, not only when run as a script.
print(f"[INFO] pypff version: {pypff.get_version()}")
# Path of the PST archive that main() ingests. Written as a raw string so the
# backslash is a literal path separator: in a normal literal "\A" is an
# invalid escape sequence and triggers a warning. The value is unchanged.
DATA_PATH = r"data\Archiv_2023_2.pst"
# Regular expressions whose matches are deleted from every e-mail body.
# format_item applies them one after another with re.sub, in list order,
# before whitespace is collapsed.
patterns = [
    r"\bM\s\+\d{3}\d{3}\d{3}\d{3}\b",  # Phone number: "M", one whitespace, "+", 12 digits
    r"\bP\s\+\d{3}\d{3}\d{3}\d{3}\b",  # Phone number: same layout with a "P" prefix
    r"\S+@\S+",  # Email addresses (any non-space run around an "@")
    r"http[s]?://\S+",  # http/https URLs, up to the next whitespace
    r"preciosalighting\.com\s*<",  # Site label followed by "<"
    r"Facebook\s*<",  # Social media link labels followed by "<"
    r"Instagram\s*<",
    r"Youtube\s*<",
    r"Pinterest\s*<",
    r"Linkedin\s*<",
    r"_+",  # Runs of underscores (separator lines)
    # Czech legal disclaimer, from its opening words to its closing sentence.
    # NOTE(review): the patterns are applied without re.DOTALL, so ".*" stops
    # at a newline -- confirm the disclaimers are single-line in the bodies.
    r"Tento e-mail je určen pouze.*od odesílatele k adresátovi\.",
    # English legal disclaimer (same caveat about newlines as above)
    r"This e-mail transmission is intended solely.*from the sender to the recipient\.",
    # Header lines: the label, the rest of its line, and the line break if any
    r"From:.*\n?",
    r"Sent:.*\n?",
    r"To:.*\n?",
    r"Cc:.*\n?",
    r"Subject:.*\n?",
    r";",  # Semicolons
    # Any remaining character that is not a word character, whitespace,
    # comma or period
    r"[^\w\s,.]",
]
def extract_emails(pst_file):
    """Collect every message stored in a PST archive.

    The folder tree is walked depth-first starting at the root folder. For
    each folder, its sub-folders are visited first and then the folder's own
    messages are recorded.

    Args:
        pst_file: Path of the PST file, passed straight to ``pypff.open``.

    Returns:
        A list with one dict per message, holding the message's
        ``subject``, ``plain_text_body`` (as ``body``), ``sender_name``
        (as ``sender``) and ``delivery_time`` (as ``date``), unmodified.
    """
    opened_pst = pypff.open(pst_file)
    emails = []

    def process_folder(folder):
        # A distinct loop name is essential: reusing ``folder`` here would
        # rebind the parameter, and the message loop below would then read
        # from a sub-folder instead of the folder being processed.
        for sub_folder in folder.sub_folders:
            process_folder(sub_folder)
        for message in folder.sub_messages:
            emails.append(
                {
                    "subject": message.subject,
                    "body": message.plain_text_body,
                    "sender": message.sender_name,
                    "date": message.delivery_time,
                }
            )

    try:
        process_folder(opened_pst.get_root_folder())
    finally:
        # Release the archive handle even if traversal fails part-way.
        opened_pst.close()
    return emails
def format_item(item, patterns):
    """Clean one extracted e-mail record for export.

    Args:
        item: Mapping with the keys ``subject``, ``body``, ``sender`` and
            ``date``. ``body`` may be UTF-8 ``bytes``, an already decoded
            ``str`` or ``None``; ``date`` may be a ``datetime`` or ``None``.
        patterns: Iterable of regular expressions. Every match is deleted
            from the body, in iteration order, using default regex flags
            (so ``.`` does not match a newline).

    Returns:
        A new dict with the same four keys. ``subject`` and ``sender`` are
        passed through unchanged, ``body`` is the cleaned text with all
        whitespace runs collapsed to single spaces, and ``date`` is
        formatted as ``YYYY-MM-DD`` (``None`` when the input date is
        ``None``).
    """
    raw_date = item["date"]
    # Guard against a missing date so one such record cannot abort a batch.
    date = raw_date.strftime("%Y-%m-%d") if raw_date is not None else None

    raw_body = item["body"]
    if raw_body is None:
        body = ""
    elif isinstance(raw_body, (bytes, bytearray)):
        # Undecodable bytes become U+FFFD instead of raising.
        body = raw_body.decode("utf-8", errors="replace")
    else:
        body = raw_body

    for pattern in patterns:
        body = re.sub(pattern, "", body)
    # Collapse whitespace last: several patterns rely on the line breaks.
    body = re.sub(r"\s+", " ", body).strip()

    return {
        "subject": item["subject"],
        "body": body,
        "sender": item["sender"],
        "date": date,
    }
def main():
    """Extract the e-mails from DATA_PATH, clean them and export them as CSV.

    Each extracted record is passed through ``format_item`` with the
    module-level ``patterns``. The result is written as a
    semicolon-separated file with an index column and a header row.
    """
    emails = extract_emails(DATA_PATH)
    dataset_list = [format_item(email, patterns) for email in emails]
    df = pd.DataFrame(dataset_list)
    # Raw string keeps the backslash literal; "\e" is not a valid escape
    # sequence in a normal string literal. The path itself is unchanged.
    df.to_csv(r"data\emails.csv", index=True, header=True, sep=";")
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()