victormiller committed on
Commit
7381c06
1 Parent(s): 9fd7ac0

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +21 -5
curated.py CHANGED
@@ -1041,10 +1041,7 @@ def curated(request):
1041
  ),
1042
  )
1043
 
1044
- text = P("""Data preprocessing is a crucial step in the data science
1045
- pipeline. It involves cleaning and transforming raw data into a format that
1046
- is suitable for analysis. This process includes handling missing values,
1047
- normalizing data, encoding categorical variables, and more.""")
1048
 
1049
  preprocessing_steps = pd.DataFrame(
1050
  {
@@ -1098,7 +1095,26 @@ def curated(request):
1098
 
1099
  table_html = preprocessing_steps.to_html(index=False, border=0)
1100
  table_div = Div(NotStr(table_html), style="margin: 40px;")
1101
- data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1102
 
1103
  return Div(
1104
  overview,
 
1041
  ),
1042
  )
1043
 
1044
+
 
 
 
1045
 
1046
  preprocessing_steps = pd.DataFrame(
1047
  {
 
1095
 
1096
  table_html = preprocessing_steps.to_html(index=False, border=0)
1097
  table_div = Div(NotStr(table_html), style="margin: 40px;")
1098
+ data_preprocessing_div = Div(
1099
+ H2("Data Preprocessing"),
1100
+ P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
1101
+ H3("Language Filter"),
1102
+ P("The Language Filter removes documents in unwanted languages. This step improves data quality by removing irrelevant documents."),
1103
+ H3("Minimum Word Count Filter"),
1104
+ P("The Minimum Word Count Filter sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information so a proper analysis is important for each datasource."),
1105
+ H3("Unigram Log Probability"),
1106
+ P("The Unigram Log Probability Filter calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but may not capture the semantic meaning of words."),
1107
+ H2("Data Processing for S2ORC"),
1108
+ P("The formatting of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
1109
+ H3("Title Abstract Filter"),
1110
+ P("The Title Abstract Filter extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
1111
+ H3("Majority Language Filter"),
1112
+ P("The Majority Language Filter identifies the majority language in the dataset. This step displays the distribution of languages in the dataset to enable language-specific analysis and insights."),
1113
+ H3("Paragraph Count Filter"),
1114
+ P("The Paragraph Count Filter counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents which can be a useful heuristic for document complexity."),
1115
+ H3("Frequency Filter"),
1116
+ P("The Frequency Filter calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
1117
+ )
1118
 
1119
  return Div(
1120
  overview,