Dreamsome committed on
Commit
7351996
·
1 Parent(s): 03563f3

smaller sample size

Browse files
Files changed (1) hide show
  1. app.py +2 -5
app.py CHANGED
@@ -5,13 +5,11 @@ import os
5
  enable_xorbits = False
6
 
7
  if enable_xorbits:
8
- import xorbits.pandas as pd
9
- import xorbits.numpy as np
10
  import xorbits
11
  xorbits.init()
 
12
  else:
13
  import pandas as pd
14
- import numpy as np
15
 
16
  st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
17
  st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
@@ -65,7 +63,7 @@ with st.spinner('Loading meta'):
65
  hf_datasets = get_hugging_face_dataset(dataset_name)
66
  subsets = set([x['config'] for x in hf_datasets['parquet_files']])
67
  subset_option = st.sidebar.selectbox("Choose a subset", subsets)
68
- sample_rate_option = st.sidebar.slider('Select sample rate', value=0.05, min_value=0.1, max_value=1.0, step=0.1)
69
 
70
  tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
71
  ["Introduction", "Junk Data🤖", "Biased Content🛡️", "Short Documents🌐", "Contamination🧹", "Duplication🔍"])
@@ -159,7 +157,6 @@ This piece of Python code calculated a measure of "impurity" in text documents,
159
 
160
  with st.spinner('Calculating impurity ratio...'):
161
  df = datasets['train']
162
-
163
  import re
164
  RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
165
 
 
5
  enable_xorbits = False
6
 
7
  if enable_xorbits:
 
 
8
  import xorbits
9
  xorbits.init()
10
+ import xorbits.pandas as pd
11
  else:
12
  import pandas as pd
 
13
 
14
  st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
15
  st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
 
63
  hf_datasets = get_hugging_face_dataset(dataset_name)
64
  subsets = set([x['config'] for x in hf_datasets['parquet_files']])
65
  subset_option = st.sidebar.selectbox("Choose a subset", subsets)
66
+ sample_rate_option = st.sidebar.slider('Select sample rate', value=0.01, min_value=0.1, max_value=1.0, step=0.1)
67
 
68
  tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
69
  ["Introduction", "Junk Data🤖", "Biased Content🛡️", "Short Documents🌐", "Contamination🧹", "Duplication🔍"])
 
157
 
158
  with st.spinner('Calculating impurity ratio...'):
159
  df = datasets['train']
 
160
  import re
161
  RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
162