Spaces:
Sleeping
Sleeping
File size: 3,411 Bytes
1957a2b ec38d9f 1957a2b ec38d9f 1957a2b ec38d9f 1957a2b ec38d9f 1957a2b ec38d9f 1957a2b ec38d9f 1957a2b ec38d9f 1957a2b ec38d9f 1957a2b ec38d9f 1957a2b 03b0230 ec38d9f 1957a2b 856356f ec38d9f 1957a2b ec38d9f 1957a2b ec38d9f 1957a2b 9aab5d8 ec38d9f 1957a2b dffaefe ec38d9f 03b0230 ec38d9f dffaefe ec38d9f dffaefe ec38d9f dffaefe ec38d9f dffaefe ec38d9f 1957a2b 536f053 ec38d9f 536f053 1957a2b ec38d9f 1957a2b ec38d9f 1957a2b 1853d90 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import gradio as gr
import pandas as pd
from clean import clean_data
from report import create_full_report, REPORT_DIR
import os
import tempfile
def clean_and_visualize(file, primary_key_column, progress=gr.Progress()):
    """Clean an uploaded CSV, generate the report images, and return their paths.

    Args:
        file: The uploaded file. Either a path string or an object whose
            ``name`` attribute holds the path of the CSV on disk.
        primary_key_column: Name of the column used to drop duplicate rows;
            also forwarded to ``clean_data`` and ``create_full_report``.
        progress: Callable invoked as ``progress(value, desc=...)`` for every
            update produced by ``clean_data``.

    Returns:
        A ``(cleaned_csv_path, image_files)`` tuple: the path of a temporary
        CSV file containing the cleaned data, and the sorted list of ``.png``
        paths found in ``REPORT_DIR``.

    Raises:
        RuntimeError: If ``clean_data`` finishes without yielding a result
            tuple, so there is no cleaned data to report on or save.
    """
    # Accept both a plain path and a file-like wrapper exposing ``.name``.
    csv_path = getattr(file, "name", file)

    # Load the data
    df = pd.read_csv(csv_path)

    # Remove duplicates from the primary key column, keeping the first occurrence
    df = df.drop_duplicates(subset=[primary_key_column], keep='first')

    # Clean the data. ``clean_data`` is consumed as (progress_value, payload)
    # pairs: a string payload is a status message, a tuple payload carries the
    # final five-element result.
    final_result = None
    for progress_value, status_text in clean_data(df, primary_key_column):
        if isinstance(status_text, tuple):
            final_result = status_text
            progress(progress_value, desc="Cleaning completed")
        else:
            progress(progress_value, desc=status_text)

    # Fail with a clear message instead of an opaque error on ``None`` later on.
    if final_result is None:
        raise RuntimeError("clean_data finished without yielding a cleaning result")

    (
        cleaned_df,
        nonconforming_cells_before,
        process_times,
        removed_columns,
        removed_rows,
    ) = final_result

    # Generate full visualization report (``df`` here is the de-duplicated input)
    create_full_report(
        df,
        cleaned_df,
        nonconforming_cells_before,
        process_times,
        removed_columns,
        removed_rows,
        primary_key_column
    )

    # Reserve a temporary CSV path. ``delete=False`` keeps the file on disk
    # after the handle is closed so the caller can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        cleaned_csv_path = tmp_file.name

    # Write only after the handle is closed: re-opening a NamedTemporaryFile by
    # name while it is still open is not portable (it fails on Windows).
    cleaned_df.to_csv(cleaned_csv_path, index=False)

    # Collect all generated images; sorted because os.listdir order is arbitrary.
    image_files = sorted(
        os.path.join(REPORT_DIR, f)
        for f in os.listdir(REPORT_DIR)
        if f.endswith('.png')
    )

    return cleaned_csv_path, image_files
def launch_app():
    """Build the "AI Data Cleaner" Gradio interface and launch it.

    The UI consists of a CSV upload, a primary-key dropdown that is filled
    from the uploaded file's header, a button that starts the cleaning run,
    a download slot for the cleaned CSV, and a gallery (hidden until results
    exist) for the generated report images.
    """
    with gr.Blocks() as app:
        gr.Markdown("# AI Data Cleaner")

        with gr.Row():
            file_input = gr.File(label="Upload CSV File", file_count="single", file_types=[".csv"])

        with gr.Row():
            primary_key_dropdown = gr.Dropdown(label="Select Primary Key Column", choices=[], interactive=True)

        with gr.Row():
            clean_button = gr.Button("Start Cleaning")

        # NOTE: gr.Progress is not a layout component, so no row is created for
        # it. Gradio supplies a live tracker through the ``progress`` default
        # parameter of the click handler below.

        with gr.Row():
            cleaned_file_output = gr.File(label="Cleaned CSV", visible=True)

        with gr.Row():
            output_gallery = gr.Gallery(
                label="Visualization Results",
                show_label=True,
                elem_id="gallery",
                columns=[3],
                rows=[3],
                object_fit="contain",
                height="auto",
                visible=False
            )

        def update_primary_key_options(file):
            """Refresh the dropdown choices from the uploaded CSV's header."""
            if file is None:
                # Upload was cleared: drop the stale column choices as well.
                return gr.Dropdown(choices=[], value=None)
            csv_path = getattr(file, "name", file)
            # Only the header row is needed to list the column names.
            header = pd.read_csv(csv_path, nrows=0)
            return gr.Dropdown(choices=header.columns.tolist())

        def process_and_show_results(file, primary_key_column, progress=gr.Progress()):
            """Run the cleaning pipeline and reveal the gallery with its images."""
            # Surface missing inputs as a UI error instead of crashing on None.
            if file is None:
                raise gr.Error("Please upload a CSV file before cleaning.")
            if not primary_key_column:
                raise gr.Error("Please select a primary key column before cleaning.")

            cleaned_csv_path, image_files = clean_and_visualize(
                file, primary_key_column, progress=progress
            )
            return (
                cleaned_csv_path,
                gr.Gallery(visible=True, value=image_files)
            )

        file_input.change(
            fn=update_primary_key_options,
            inputs=file_input,
            outputs=primary_key_dropdown
        )

        clean_button.click(
            fn=process_and_show_results,
            inputs=[file_input, primary_key_dropdown],
            outputs=[cleaned_file_output, output_gallery]
        )

    app.launch()
# Launch the UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    launch_app()