Kaludi committed on
Commit
3c540f6
·
1 Parent(s): 6753645

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +81 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app for uploading, optionally merging, cleaning and downloading CSV files.

Flow of one script run:
1. Read every uploaded CSV into a DataFrame.
2. If more than one file was uploaded, optionally merge them into a single
   DataFrame (with header validation, de-duplication and empty-row removal).
3. Optionally display the resulting DataFrame(s).
4. On request, render base64 data-URI download links for the result(s).
"""
import streamlit as st
import pandas as pd
import io
import base64

# Maps the escaped labels shown in the "End line" select box to the actual
# line terminators handed to DataFrame.to_csv.
LINE_TERMINATORS = {"\\n": "\n", "\\r\\n": "\r\n"}

st.set_page_config(page_title="Data Cleaning Tool", layout="wide")

st.title("CSV Data Cleaning Tool")

st.markdown("Upload one or multiple CSV files to preprocess and clean your files quickly and stress free.")

uploaded_files = st.file_uploader("Choose CSV files", type="csv", accept_multiple_files=True)

dataframes = []

# None means "no explicit choice": to_csv then falls back to its own default.
# Only the merge options below can set a terminator.
line_terminator = None

if uploaded_files:
    for uploaded_file in uploaded_files:
        # Rewind first: the upload buffer may already have been consumed by
        # an earlier run of the script.
        uploaded_file.seek(0)
        dataframes.append(pd.read_csv(uploaded_file))

    if len(dataframes) > 1:
        merge = st.checkbox("Merge uploaded CSV files")

        if merge:
            # Merge options
            keep_first_header_only = st.selectbox("Keep only the header (first row) of the first file", ["Yes", "No"])
            remove_duplicate_rows = st.selectbox("Remove duplicate rows", ["No", "Yes"])
            remove_empty_rows = st.selectbox("Remove empty rows", ["Yes", "No"])
            end_line = st.selectbox("End line", ["\\n", "\\r\\n"])
            line_terminator = LINE_TERMINATORS[end_line]

            if keep_first_header_only == "Yes":
                # Every file must carry exactly the first file's columns.
                # Compare by name and then reorder by name; assigning to
                # `df.columns` would relabel by position and silently attach
                # the wrong header to the data when the order differs.
                reference_columns = dataframes[0].columns
                reference_names = set(reference_columns)
                columns_match = all(
                    len(frame.columns) == len(reference_columns)
                    and set(frame.columns) == reference_names
                    for frame in dataframes[1:]
                )
                if not columns_match:
                    st.error("Please make sure columns match in all files. If you don't want them to match, select 'No' in the first option.")
                    st.stop()
                dataframes = [dataframes[0]] + [frame[reference_columns] for frame in dataframes[1:]]

            try:
                merged_df = pd.concat(dataframes, ignore_index=True, join='outer')
            except ValueError:
                st.error("Please make sure columns match in all files. If you don't want them to match, select 'No' in the first option.")
                st.stop()

            if remove_duplicate_rows == "Yes":
                merged_df = merged_df.drop_duplicates()

            if remove_empty_rows == "Yes":
                # Only rows where every cell is missing are dropped.
                merged_df = merged_df.dropna(how="all")

            dataframes = [merged_df]

    # Show or hide DataFrames
    show_dataframes = st.checkbox("Show DataFrames", value=True)

    if show_dataframes:
        for i, df in enumerate(dataframes):
            st.write(f"DataFrame {i + 1}")
            st.dataframe(df)

    if st.button("Download cleaned data"):
        # Pass the terminator only when the user chose one, so the non-merge
        # path calls to_csv exactly as before. The `lineterminator` keyword
        # requires pandas >= 1.5.
        csv_options = {"index": False}
        if line_terminator is not None:
            csv_options["lineterminator"] = line_terminator

        for i, df in enumerate(dataframes):
            csv = df.to_csv(**csv_options)
            # Embed the CSV in a data URI so the browser can save it without
            # a server-side file.
            b64 = base64.b64encode(csv.encode()).decode()
            href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data_{i + 1}.csv">Download cleaned_data_{i + 1}.csv</a>'
            st.markdown(href, unsafe_allow_html=True)
else:
    st.warning("Please upload a CSV file.")
    # Halts the run here, so the footer below is not rendered until a file
    # has been uploaded.
    st.stop()

st.markdown("")
st.markdown("---")
st.markdown("")
st.markdown("<p style='text-align: center'><a href='https://github.com/Kaludii'>Github</a> | <a href='https://huggingface.co/Kaludi'>HuggingFace</a></p>", unsafe_allow_html=True)

# Hide Streamlit's default hamburger menu and footer via injected CSS.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ openai
2
+ streamlit
3
+ PyPDF2
4
+ Pillow