In [1]:
import xml.etree.ElementTree as ET
import re
import os
import json

In [2]:
cleaned_data = []
cleaned_test_data = []

# FUNCTION TO CLEAN DATA FILES IN SPECIFIC DIRECTORIES

In [3]:
def clean_data(summary_path : str, bill_path : str):
    sums_path = summary_path
    bills_path = bill_path
    
    # GETTING A LIST OF THE FILE NAMES FOR THE SUMMARY DATA FILES
    sums_dir_list = []
    for x in os.listdir(sums_path):
        if x.endswith(".xml"):
            sums_dir_list.append(x)
    sums_dir_list = sorted(sums_dir_list)

    # GETTING A LIST OF THE FILE NAMES FOR THE BILL DATA FILES
    bills_dir_list = []
    for x in os.listdir(bills_path):
        if x.endswith(".xml"):
            bills_dir_list.append(x)
    bills_dir_list = sorted(bills_dir_list)
    
    for i in range(len(sums_dir_list)):
        # SUMMARY
        summary_texts = []
        sum_path = sums_path + sums_dir_list[i]
        tree_sum = ET.parse(sum_path)
        root_sum = tree_sum.getroot()
        for description in root_sum.iter('summary-text'):
            temp = description.text
            temp = re.sub('<[^>]+>', '', temp)
            temp = re.sub('\s+', ' ', temp)
            temp = re.sub('^[\s]+', '', temp)
            temp = re.sub('[\s]+$', '', temp)
            summary_texts.append(str(temp))
        summary = summary_texts[0]
    
        # TITLE
        title = ""
        for title in root_sum.iter("title"):
            title = title.text
    
        # BILL
        bill_path = bills_path + bills_dir_list[i]
        tree_bill = ET.parse(sum_path)
        root_bill = tree_bill.getroot()
        bill = ""
        for child in root_bill.iter():
            temp = child.text
            bill = bill + temp + " "
        bill = re.sub('<[^>]+>', '', bill)
        bill = re.sub('\s+', ' ', bill)
        bill = re.sub('^[\s]+', '', bill)
        bill = re.sub('[\s]+$', '', bill)
    
        # MERGE TO LIST OF DICTIONARIES
        cleaned_data.append({"summary" : summary, "text" : bill, "title" : title})

# CALLING CLEAN DATA FUNCTION ON ALL DATA FOLDERS

In [None]:
clean_data("./data/sums/117/", "./data/bills/117")

In [None]:
clean_data("./data/sums/116/", "./data/bills/116")

In [None]:
clean_data("./data/sums/115/", "./data/bills/115")

In [None]:
clean_data("./data/sums/114/", "./data/bills/114")

In [None]:
clean_data("./data/sums/113/", "./data/bills/113")

# CHECKING FOR CORRECT LENGTH AND SPOT CHECKING DATA

In [None]:
len(cleaned_data)

In [None]:
cleaned_data[19]

# SAVING CLEANED DATA AS JSON FILE TO TRANSFER TO TEMU

In [None]:
# Convert and write JSON object to file
with open("cleaned_bill_sum_data.json", "w") as outfile: 
    json.dump(cleaned_data, outfile)

# IMPORTING CLEANED DATA JSON FILE TO A LIST OF DICTIONARIES

In [None]:
# Opening JSON file
with open("cleaned_bill_sum_data.json") as json_file:
    cleaned_data = json.load(json_file)

In [None]:
# import datasets
# from datasets import load_dataset
# import pandas as pd
# from datasets import Dataset

In [None]:
# data = pd.DataFrame.from_dict(cleaned_data)

In [None]:
# dataset = Dataset.from_pandas(pd.DataFrame(data=data))

In [None]:
# billsum = dataset.train_test_split(test_size=0.2)

# FUNCTION TO CLEAN THE TEST DATA

In [None]:
def clean_test_data(summary_path : str, bill_path : str):
    sums_path = summary_path
    bills_path = bill_path
    
    # GETTING A LIST OF THE FILE NAMES FOR THE SUMMARY DATA FILES
    sums_dir_list = []
    for x in os.listdir(sums_path):
        if x.endswith(".xml"):
            sums_dir_list.append(x)
    sums_dir_list = sorted(sums_dir_list)

    # GETTING A LIST OF THE FILE NAMES FOR THE BILL DATA FILES
    bills_dir_list = []
    for x in os.listdir(bills_path):
        if x.endswith(".xml"):
            bills_dir_list.append(x)
    bills_dir_list = sorted(bills_dir_list)
    
    for i in range(len(sums_dir_list)):
        # SUMMARY
        summary_texts = []
        sum_path = sums_path + sums_dir_list[i]
        tree_sum = ET.parse(sum_path)
        root_sum = tree_sum.getroot()
        for description in root_sum.iter('summary-text'):
            temp = description.text
            temp = re.sub('<[^>]+>', '', temp)
            temp = re.sub('\s+', ' ', temp)
            temp = re.sub('^[\s]+', '', temp)
            temp = re.sub('[\s]+$', '', temp)
            summary_texts.append(str(temp))
        summary = summary_texts[0]
    
        # TITLE
        title = ""
        for title in root_sum.iter("title"):
            title = title.text
    
        # BILL
        bill_path = bills_path + bills_dir_list[i]
        tree_bill = ET.parse(sum_path)
        root_bill = tree_bill.getroot()
        bill = ""
        for child in root_bill.iter():
            temp = child.text
            bill = bill + temp + " "
        bill = re.sub('<[^>]+>', '', bill)
        bill = re.sub('\s+', ' ', bill)
        bill = re.sub('^[\s]+', '', bill)
        bill = re.sub('[\s]+$', '', bill)
    
        # MERGE TO LIST OF DICTIONARIES
        cleaned_test_data.append({"summary" : summary, "text" : bill, "title" : title})

# CALLING FUNCTION TO CLEAN THE TEST DATA

In [None]:
clean_test_data("./data/test/sums/", "./data/test/bills/")

# SHOWING A SAMPLE OF THE TEST DATA

In [None]:
cleaned_test_data[11]

# SAVING CLEANED TEST DATA TO JSON

In [None]:
# Convert and write JSON object to file
with open("cleaned_bill_sum_test_data.json", "w") as outfile: 
    json.dump(cleaned_test_data, outfile)

# LOADING CLEANED TEST DATA

In [None]:
# Opening JSON file
with open("cleaned_bill_sum_test_data.json") as json_file:
    cleaned_test_data = json.load(json_file)

# SINGLE FILE CLEANING

In [4]:
new_cleaned_test_data = []

In [5]:
def single_file_cleaning(summary : str, bill : str):
    sum_path = summary
    bill_path = bill
    
    # SUMMARY
    summary_texts = []
    # sum_path = sums_path + sums_dir_list[i]
    tree_sum = ET.parse(sum_path)
    root_sum = tree_sum.getroot()
    for description in root_sum.iter('summary-text'):
        temp = description.text
        temp = re.sub('<[^>]+>', '', temp)
        temp = re.sub('\s+', ' ', temp)
        temp = re.sub('^[\s]+', '', temp)
        temp = re.sub('[\s]+$', '', temp)
        summary_texts.append(str(temp))
    summary = summary_texts[0]
    
    # TITLE
    title = ""
    for title in root_sum.iter("title"):
        title = title.text
    
    # BILL
    # bill_path = bills_path + bills_dir_list[i]
    tree_bill = ET.parse(sum_path)
    root_bill = tree_bill.getroot()
    bill = ""
    for child in root_bill.iter():
        temp = child.text
        bill = bill + temp + " "
    bill = re.sub('<[^>]+>', '', bill)
    bill = re.sub('\s+', ' ', bill)
    bill = re.sub('^[\s]+', '', bill)
    bill = re.sub('[\s]+$', '', bill)
    
    # MERGE TO LIST OF DICTIONARIES
    new_cleaned_test_data.append({"summary" : summary, "text" : bill, "title" : title})

In [6]:
single_file_cleaning("./BILLSUM-118s1000.xml", "./BILLS-118s1000is.xml")

In [7]:
new_cleaned_test_data

[{'summary': 'Saving Access to Laboratory Services Act This bill modifies provisions relating to Medicare payment rates for clinical diagnostic laboratory services, including by requiring payment rates for certain widely available clinical diagnostic laboratory tests to be based on a statistical sampling of private sector rates.',
  'text': 'Saving Access to Laboratory Services Act 2023-03-28 Introduced in Senate Saving Access to Laboratory Services Act This bill modifies provisions relating to Medicare payment rates for clinical diagnostic laboratory services, including by requiring payment rates for certain widely available clinical diagnostic laboratory tests to be based on a statistical sampling of private sector rates. text/xml EN Pursuant to Title 17 Section 105 of the United States Code, this file is not subject to copyright protection and is in the public domain. Congressional Research Service, Library of Congress This file contains bill summaries for federal legislation. A bil

In [8]:
single_file_cleaning("./BILLSUM-118s1016.xml", "./BILLS-118s1016is.xml")

In [9]:
new_cleaned_test_data

[{'summary': 'Saving Access to Laboratory Services Act This bill modifies provisions relating to Medicare payment rates for clinical diagnostic laboratory services, including by requiring payment rates for certain widely available clinical diagnostic laboratory tests to be based on a statistical sampling of private sector rates.',
  'text': 'Saving Access to Laboratory Services Act 2023-03-28 Introduced in Senate Saving Access to Laboratory Services Act This bill modifies provisions relating to Medicare payment rates for clinical diagnostic laboratory services, including by requiring payment rates for certain widely available clinical diagnostic laboratory tests to be based on a statistical sampling of private sector rates. text/xml EN Pursuant to Title 17 Section 105 of the United States Code, this file is not subject to copyright protection and is in the public domain. Congressional Research Service, Library of Congress This file contains bill summaries for federal legislation. A bil

In [10]:
# Convert and write JSON object to file
with open("new_cleaned_bill_sum_test_data.json", "w") as outfile: 
    json.dump(new_cleaned_test_data, outfile)