Spaces: Runtime error
smaller sample size
Browse files
app.py
CHANGED
@@ -5,13 +5,11 @@ import os
|
|
5 |
enable_xorbits = False
|
6 |
|
7 |
if enable_xorbits:
|
8 |
-
import xorbits.pandas as pd
|
9 |
-
import xorbits.numpy as np
|
10 |
import xorbits
|
11 |
xorbits.init()
|
|
|
12 |
else:
|
13 |
import pandas as pd
|
14 |
-
import numpy as np
|
15 |
|
16 |
st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
|
17 |
st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
|
@@ -65,7 +63,7 @@ with st.spinner('Loading meta'):
|
|
65 |
hf_datasets = get_hugging_face_dataset(dataset_name)
|
66 |
subsets = set([x['config'] for x in hf_datasets['parquet_files']])
|
67 |
subset_option = st.sidebar.selectbox("Choose a subset", subsets)
|
68 |
-
sample_rate_option = st.sidebar.slider('Select sample rate', value=0. [… removed line truncated in the diff view]
|
69 |
|
70 |
tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
|
71 |
["Introduction", "Junk Data🤖", "Biased Content🛡️", "Short Documents🌐", "Contamination🧹", "Duplication🔍"])
|
@@ -159,7 +157,6 @@ This piece of Python code calculated a measure of "impurity" in text documents,
|
|
159 |
|
160 |
with st.spinner('Calculating impurity ratio...'):
|
161 |
df = datasets['train']
|
162 |
-
|
163 |
import re
|
164 |
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
|
165 |
|
|
|
5 |
enable_xorbits = False
|
6 |
|
7 |
if enable_xorbits:
|
|
|
|
|
8 |
import xorbits
|
9 |
xorbits.init()
|
10 |
+
import xorbits.pandas as pd
|
11 |
else:
|
12 |
import pandas as pd
|
|
|
13 |
|
14 |
st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
|
15 |
st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
|
|
|
63 |
hf_datasets = get_hugging_face_dataset(dataset_name)
|
64 |
subsets = set([x['config'] for x in hf_datasets['parquet_files']])
|
65 |
subset_option = st.sidebar.selectbox("Choose a subset", subsets)
|
66 |
+
sample_rate_option = st.sidebar.slider('Select sample rate', value=0.01, min_value=0.1, max_value=1.0, step=0.1)
|
67 |
|
68 |
tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
|
69 |
["Introduction", "Junk Data🤖", "Biased Content🛡️", "Short Documents🌐", "Contamination🧹", "Duplication🔍"])
|
|
|
157 |
|
158 |
with st.spinner('Calculating impurity ratio...'):
|
159 |
df = datasets['train']
|
|
|
160 |
import re
|
161 |
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
|
162 |
|