laudavid committed on
Commit
ce45b2a
1 Parent(s): ae65ca1

latest version of app

Browse files
main_page.py CHANGED
@@ -44,21 +44,21 @@ col1, col2 = st.columns([0.65,0.35], gap="medium")
44
  with col1:
45
  st.title("AI and Data Science Examples")
46
  st.subheader("HEC Paris, 2023-2024")
47
- st.markdown("""**Course provided by Shirish C. SRIVASTAVA** <br>
48
- **Hi! PARIS Engineering team**: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN""", unsafe_allow_html=True)
49
  #st.markdown("in collaboration with Hi! PARIS engineers: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN")
50
 
51
- with col2:
52
  #Hi! PARIS collaboration mention
53
- st.markdown(" ")
54
- st.markdown(" ")
55
- st.markdown(" ")
56
- image_hiparis = Image.open('images/hi-paris.png')
57
- st.image(image_hiparis, width=150)
58
 
59
  url = "https://www.hi-paris.fr/"
60
  #st.markdown("This app was funded by the Hi! PARIS Center")
61
- st.markdown("""###### **Made in collaboration with [Hi! PARIS](%s)** """ % url, unsafe_allow_html=True)
 
 
62
 
63
 
64
  st.markdown(" ")
@@ -114,23 +114,25 @@ show_pages(
114
  st.header("About the app")
115
 
116
 
117
- st.info("""The **AI and Data Science Examples** app was created as a tool to introduce students to the field of Data Science by showcasing real-life applications of AI.
118
- It includes use cases using traditional Machine Learning algorithms on structured data, as well as models that analyze unstructured data (text, images,...).""")
119
 
120
  st.markdown(" ")
121
 
122
- st.markdown("""The app is structured into three sections:
123
- - 1️⃣ **Machine Learning**: This first section covers use cases where structured data (data in a tabular format) is fed to an AI model.
124
  You will find pages on *Supervised/Unsupervised Learning*, *Time Series Forecasting* and AI powered *Recommendation Systems*.
125
  - 2️⃣ **Natural Language Processing** (NLP): This second section showcases AI applications where large amounts of text data is analyzed using Deep Learning models.
126
- Pages on *Topic Modeling* and *Sentiment Analysis*, which are types of NLP models, can be found in this section.
127
- - 3️⃣ **Computer Vision**: This final section covers a sub-field of AI called Computer Vision which deals with image/video data.
128
  The field of Computer Vision includes *Image classification* and *Object Detection*, which are both featured in this section.
 
 
129
  """)
130
 
131
  st.image("images/ML_domains.png",
132
- caption="""This figure showcases a selection of sub-fields in Artificial Intelligence, such as traditional
133
- Machine Learning, NLP, Computer Vision and Robotics.""")
134
 
135
 
136
  # st.markdown(" ")
 
44
  with col1:
45
  st.title("AI and Data Science Examples")
46
  st.subheader("HEC Paris, 2023-2024")
47
+ # st.markdown("""**Course provided by Shirish C. SRIVASTAVA** <br>
48
+ # **Hi! PARIS Engineering team**: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN""", unsafe_allow_html=True)
49
  #st.markdown("in collaboration with Hi! PARIS engineers: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN")
50
 
51
+ # with col2:
52
  #Hi! PARIS collaboration mention
53
+ # st.markdown(" ")
54
+ # st.markdown(" ")
55
+ #st.markdown(" ")
 
 
56
 
57
  url = "https://www.hi-paris.fr/"
58
  #st.markdown("This app was funded by the Hi! PARIS Center")
59
+ st.markdown("""###### **The app was made in collaboration with [Hi! PARIS](%s)** """ % url, unsafe_allow_html=True)
60
+ image_hiparis = Image.open('images/hi-paris.png')
61
+ st.image(image_hiparis, width=150)
62
 
63
 
64
  st.markdown(" ")
 
114
  st.header("About the app")
115
 
116
 
117
+ st.info("""The goal of the **AI and Data Science Examples** is to give an introduction to Data Science by showcasing real-life applications.
118
+ The app includes use cases using traditional Machine Learning algorithms on structured data, as well as models that analyze unstructured data (text, images,...).""")
119
 
120
  st.markdown(" ")
121
 
122
+ st.markdown("""The app contains four sections:
123
+ - 1️⃣ **Machine Learning**: This first section covers use cases where structured data (data in a tabular format) is used to train an AI model.
124
  You will find pages on *Supervised/Unsupervised Learning*, *Time Series Forecasting* and AI powered *Recommendation Systems*.
125
  - 2️⃣ **Natural Language Processing** (NLP): This second section showcases AI applications where large amounts of text data is analyzed using Deep Learning models.
126
+ Pages on *Topic Modeling* and *Sentiment Analysis*, which are different kinds of NLP models, can be found in this section.
127
+ - 3️⃣ **Computer Vision**: This third section covers a sub-field of AI called Computer Vision, which deals with image/video data.
128
  The field of Computer Vision includes *Image classification* and *Object Detection*, which are both featured in this section.
129
+ - 🚀 **Go further**: In the final section, you will gain a deeper understanding of AI models and how they function.
130
+ The page features multiple models to try, as well as different datasets to train a model on.
131
  """)
132
 
133
  st.image("images/ML_domains.png",
134
+ caption="""This figure showcases a selection of sub-fields of AI, which includes
135
+ Machine Learning, NLP and Computer Vision.""")
136
 
137
 
138
  # st.markdown(" ")
pages/go_further.py CHANGED
@@ -43,11 +43,11 @@ def model_training(X, y, model_dict, _num_transformer=MinMaxScaler(),
43
  model_sklearn = KNeighborsClassifier(n_neighbors=param)
44
 
45
  if model == "Decision Tree 🌳":
46
- model_sklearn = DecisionTreeClassifier(max_depth=param)
47
  explainability = True
48
 
49
  if model == "Random Forest 🏕️":
50
- model_sklearn = RandomForestClassifier(max_depth=param)
51
  explainability = True
52
 
53
 
@@ -125,19 +125,15 @@ scores = np.diag(cm)
125
 
126
  st.image("images/ML_header.jpg")
127
  st.markdown("# Go further 🚀")
128
- st.markdown("""This page allows you to test and compare the results of different AI models, and gain a deeper understanding of how they function. <br>
129
  It includes three different types of **classification models** with Python code illustrations, as well as four datasets to choose from.
130
- """, unsafe_allow_html=True)
131
-
132
- # st.markdown("""**Reminder**: Classification models are AI models that are trained to predict a finite number of values/categories.
133
- # Examples can be found in the *Supervised vs Unsupervised* page with the credit score classification and customer churn prediction use cases.""")
134
-
135
- st.warning("""**Note**: Different types of models exists for most Machine Learning tasks.
136
- Models tend to vary in complexity and picking which one to train for a specific use case isn't always straightforward.
137
- Complex model might output better results but take longer to make predictions.
138
- The model selection step requires a good amount of testing by practitioners.""")
139
 
140
- st.markdown("""All of the classification models used in this page come from `scikit-learn`, which is a popular Data Science library in Python.""")
 
141
  try:
142
  st.link_button("Go to the scikit-learn website", "https://scikit-learn.org/stable/index.html")
143
  except:
@@ -155,17 +151,20 @@ st.markdown("""**Reminder**: Classification models are AI models that are traine
155
  st.markdown(" ")
156
  st.markdown(" ")
157
 
 
 
 
158
  ########################## SELECT A DATASET ###############################
159
 
160
  st.markdown("### Select a dataset 📋")
161
- st.markdown("""To perform the classification task, you can choose between three different datasets: **Wine quality**, **Titanic** and **Car evaluation**. <br>
162
  Each dataset will be shown in its original format and will go through pre-processing steps to insure its quality and usability for the chosen model.
163
  """, unsafe_allow_html=True)
164
 
165
  st.warning("""**Note:** The performance of a Machine Learning model is sensitive to the data being used to train it.
166
  Data cleaning and pre-processing are usually as important as training the AI model. These steps can include removing missing values, identifying outliers and transforming columns from text to numbers.""")
167
 
168
- select_data = st.selectbox("Choose an option", ["Wine quality 🍷", "Titanic 🚢", "Car evaluation 🚙", "Diabetes 👩‍⚕️"]) #label_visibility="collapsed")
169
  st.markdown(" ")
170
 
171
  if select_data =="Wine quality 🍷":
@@ -259,7 +258,7 @@ if select_data == "Car evaluation 🚙":
259
  - **Evaluation**: Evaluation level (unacceptable, acceptable)""")
260
 
261
 
262
- if select_data == "Diabetes 👩‍⚕️":
263
  # Load data and clean it
264
  data = load_data_csv(path_data, "diabetes.csv")
265
  data["Outcome"] = data["Outcome"].map({1:"Yes", 0:"No"})
@@ -299,6 +298,8 @@ st.markdown(" ")
299
  st.markdown(" ")
300
 
301
 
 
 
302
  ########################## SELECT A MODEL ###############################
303
 
304
  st.markdown("### Select a model 📚")
@@ -306,6 +307,11 @@ st.markdown("""You can choose between three types of classification models: **K
306
  For each model, you will be given a short explanation as to how they function.
307
  """, unsafe_allow_html=True)
308
 
 
 
 
 
 
309
  select_model = st.selectbox("**Choose an option**", ["K-nearest-neighbor 🏘️", "Decision Tree 🌳", "Random Forest 🏕️"])
310
  st.markdown(" ")
311
 
@@ -313,19 +319,20 @@ st.markdown(" ")
313
  if select_model == "K-nearest-neighbor 🏘️":
314
  #st.markdown("#### Model: K-nearest-neighbor")
315
  st.info("""**About the model**: K-nearest-neighbor (or KNN) is a type of classification model that uses neighboring points to classify new data.
316
- When trying to predict a class to new data points, the algorithm will look at points in close proximity (or in its neighborhood) to make a decision.
317
- The most common class among its neighborhood will then be assigned to the data point.""")
318
 
319
  select_param = 6
320
  model_dict = {"model":select_model, "param":select_param}
321
 
322
- learn_model = st.checkbox("Learn more", key="knn")
323
  if learn_model:
324
  st.markdown("""An important parameter in KNN algorithms is the number of points to choose as neighboors. <br>
325
  The image below shows two cases where the number of neighboors (k) are equal to 3 and 6.
326
- - When k is equal to 3, the most common class is **Classe B**. The red point will then be predicted as Classe B.
327
- - When k is equal to 6, the the most common class is **Classe A**. The red point will then be predicted as Classe A.""",
328
  unsafe_allow_html=True)
 
329
  st.image("images/knn.png", width=600)
330
  st.markdown("""K-nearest-neighbor algorithm are popular for their simplicity. <br>
331
  This can be a drawback for use cases/dataset that require a more complex approach to make accurate predictions.""", unsafe_allow_html=True)
@@ -339,15 +346,15 @@ if select_model == "Decision Tree 🌳":
339
  st.info("""**About the model**: Decision trees are classification model that split the prediction task into a succession of decisions, each with only two possible outcomes.
340
  These decisions can be visualized as a tree, with data points arriving from the top of the tree and landing at final "prediction regions".""")
341
 
342
- select_param = None
343
  model_dict = {"model":select_model, "param":select_param}
344
 
345
- learn_model = st.checkbox("Learn more", key="tree")
346
  if learn_model:
347
- st.markdown("""The following image showcases a decision tree that was built to predict whether a **bank should give out a loan** to a client. <br>
348
  The data used to train the model has each client's **age**, **salary** and **number of children**.""", unsafe_allow_html=True)
349
 
350
- st.markdown("""To predict whether a client gets a loan, the client's data goes through each 'question' in the tree and **gets assigned the class of the region it fell into**. <br>
351
  For example, a client that is under 30 years old and has a lower salary than 2500$ will not be awarded a loan by the model.""", unsafe_allow_html=True)
352
 
353
  st.image("images/decisiontree.png", width=800)
@@ -363,15 +370,15 @@ if select_model == "Decision Tree 🌳":
363
  if select_model == "Random Forest 🏕️":
364
  st.info("""**About the model:** Random Forest models generate multiple decision tree models to make predictions.
365
  The main drawback of decision trees is that their predictions can be unstable, meaning that their output often changes.
366
- Random Forest models aggregate the predictions of multiple decision trees to reduce this unstability and improve robustness.""")
367
 
368
- select_param = None
369
  model_dict = {"model":select_model, "param":select_param}
370
 
371
- learn_model = st.checkbox("Learn more", key="tree")
372
  if learn_model:
373
- st.markdown("""Random Forests classifiers aggregate results by apply **majority voting**, which means selecting the class that was most often predicted by trees as the final prediction.
374
- In the following image, the random forest model built four decision trees, who each have made their own final prediction. <br>"""
375
  , unsafe_allow_html=True)
376
 
377
  st.markdown("""Class C was predicted twice, whereas Class B et D where only predicted once. <br>
@@ -401,28 +408,36 @@ st.markdown(f"""You've selected the **{select_data}** dataset and the **{select_
401
 
402
 
403
  run_model = st.button("Run model", type="primary")
 
404
 
405
  if run_model:
406
  score, feature_imp, feature_names, labels = model_training(X, y, model_dict, _num_transformer=StandardScaler())
407
 
408
  if select_model in ["Decision Tree 🌳", "Random Forest 🏕️"]: # show explainability for decision tree, random firest
409
- tab1, tab2 = st.tabs(["Accuracy", "Explainability"])
410
 
411
  with tab1:
412
- if select_data == "Diabetes 👩‍⚕️":
413
- st.error("""**Important**: The Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
 
 
 
414
  This small number of patient data explains why the model's performance isn't optimal.
415
- Additional data collection should be conducted to improve results, as well as hyperparameter tuning (see explanation after graph).""")
416
 
417
  score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
418
- fig = px.bar(score_df, x="label", y="accuracy", color="label", title="Accuracy results", text_auto=True)
419
  st.plotly_chart(fig, use_container_width=True)
420
-
421
  st.warning("""**Note**: To improve the results of a model, practionners often conduct *hyperparameter tuning*.
422
  It consists of trying different combination of the model's parameters to maximise the accuracy score.
423
  Hyperparameter tuning wasn't conduct here in order to insure the app doesn't lag.""")
 
424
 
425
  with tab2:
 
 
 
426
 
427
  df_feature_imp = pd.DataFrame({"variable":feature_names, "importance":feature_imp})
428
  df_feature_imp = df_feature_imp.groupby("variable").mean().reset_index()
@@ -434,14 +449,16 @@ if run_model:
434
 
435
  else: # only show results for knn
436
  st.markdown("#### Results")
 
 
437
 
438
  st.markdown("""The K-nearest-neighbor algorithm doesn't have a built-in solution to compute model explainability with `scikit-learn`.
439
- You can use other python packages such as `SHAP` to compute explainability, which we didn't use here since they usually take a long time to output results.""")
440
 
441
- if select_data == "Diabetes 👩‍⚕️":
442
- st.error("""**Important**: Note that Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
443
  This small number of patient data explains why the model's performance isn't optimal.
444
- Additional data collection should be conducted to improve results, as well as hyperparameter tuning (see explanation after graph).""")
445
 
446
  score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
447
  fig = px.bar(score_df, x="label", y="accuracy", color="label", title="Accuracy results", text_auto=True)
@@ -458,3 +475,4 @@ if run_model:
458
 
459
 
460
 
 
 
43
  model_sklearn = KNeighborsClassifier(n_neighbors=param)
44
 
45
  if model == "Decision Tree 🌳":
46
+ model_sklearn = DecisionTreeClassifier(max_depth=param, class_weight="balanced")
47
  explainability = True
48
 
49
  if model == "Random Forest 🏕️":
50
+ model_sklearn = RandomForestClassifier(max_depth=param, )#class_weight="balanced_subsample")
51
  explainability = True
52
 
53
 
 
125
 
126
  st.image("images/ML_header.jpg")
127
  st.markdown("# Go further 🚀")
128
+ st.markdown("""This page allows you to test and compare results between different AI models, and gain a deeper understanding of how they make predictions. <br>
129
  It includes three different types of **classification models** with Python code illustrations, as well as four datasets to choose from.
130
+
131
+ **Explainability** is also given for most models.
132
+ These results give an indication on which variable had the most impact on the model's final prediction. <br>
133
+ Note that each model has its own way of measuring explainability, which makes comparisons between model explainabilities difficult.
 
 
 
 
 
134
 
135
+ All of the classification models used in this page come from `scikit-learn`, which is a popular Data Science library in Python.
136
+ """, unsafe_allow_html=True)
137
  try:
138
  st.link_button("Go to the scikit-learn website", "https://scikit-learn.org/stable/index.html")
139
  except:
 
151
  st.markdown(" ")
152
  st.markdown(" ")
153
 
154
+
155
+
156
+
157
  ########################## SELECT A DATASET ###############################
158
 
159
  st.markdown("### Select a dataset 📋")
160
+ st.markdown("""To perform the classification task, you can choose between four different datasets: **Titanic**, **Car evaluation**, **Wine quality** and **Diabetes prevention** <br>
161
  Each dataset will be shown in its original format and will go through pre-processing steps to insure its quality and usability for the chosen model.
162
  """, unsafe_allow_html=True)
163
 
164
  st.warning("""**Note:** The performance of a Machine Learning model is sensitive to the data being used to train it.
165
  Data cleaning and pre-processing are usually as important as training the AI model. These steps can include removing missing values, identifying outliers and transforming columns from text to numbers.""")
166
 
167
+ select_data = st.selectbox("Choose an option", ["Titanic 🚢", "Car evaluation 🚙", "Wine quality 🍷", "Diabetes prevention 👩‍⚕️"]) #label_visibility="collapsed")
168
  st.markdown(" ")
169
 
170
  if select_data =="Wine quality 🍷":
 
258
  - **Evaluation**: Evaluation level (unacceptable, acceptable)""")
259
 
260
 
261
+ if select_data == "Diabetes prevention 👩‍⚕️":
262
  # Load data and clean it
263
  data = load_data_csv(path_data, "diabetes.csv")
264
  data["Outcome"] = data["Outcome"].map({1:"Yes", 0:"No"})
 
298
  st.markdown(" ")
299
 
300
 
301
+
302
+
303
  ########################## SELECT A MODEL ###############################
304
 
305
  st.markdown("### Select a model 📚")
 
307
  For each model, you will be given a short explanation as to how they function.
308
  """, unsafe_allow_html=True)
309
 
310
+ st.warning("""**Note**: Different types of models exist for most Machine Learning tasks.
311
+ Models tend to vary in complexity and picking which one to train for a specific use case isn't always straightforward.
312
+ Complex models might output better results but take longer to make predictions.
313
+ The model selection step requires a good amount of testing by practitioners.""")
314
+
315
  select_model = st.selectbox("**Choose an option**", ["K-nearest-neighbor 🏘️", "Decision Tree 🌳", "Random Forest 🏕️"])
316
  st.markdown(" ")
317
 
 
319
  if select_model == "K-nearest-neighbor 🏘️":
320
  #st.markdown("#### Model: K-nearest-neighbor")
321
  st.info("""**About the model**: K-nearest-neighbor (or KNN) is a type of classification model that uses neighboring points to classify new data.
322
+ When trying to predict a class for a new data point, the algorithm will look at points in close proximity (or in its neighborhood) to make a decision.
323
+ The most common class in the points' neighborhood will then be chosen as the final prediction.""")
324
 
325
  select_param = 6
326
  model_dict = {"model":select_model, "param":select_param}
327
 
328
+ learn_model = st.checkbox("Learn more about the model", key="knn")
329
  if learn_model:
330
  st.markdown("""An important parameter in KNN algorithms is the number of points to choose as neighboors. <br>
331
  The image below shows two cases where the number of neighboors (k) are equal to 3 and 6.
332
+ - When k is equal to 3 (the small dotted circle in the image below), the most common class is **Class B**. The red point will then be predicted as Class B.
333
+ - When k is equal to 6 (the large dotted circle in the image below), the most common class is **Class A**. The red point will then be predicted as Class A.""",
334
  unsafe_allow_html=True)
335
+
336
  st.image("images/knn.png", width=600)
337
  st.markdown("""K-nearest-neighbor algorithm are popular for their simplicity. <br>
338
  This can be a drawback for use cases/dataset that require a more complex approach to make accurate predictions.""", unsafe_allow_html=True)
 
346
  st.info("""**About the model**: Decision trees are classification model that split the prediction task into a succession of decisions, each with only two possible outcomes.
347
  These decisions can be visualized as a tree, with data points arriving from the top of the tree and landing at final "prediction regions".""")
348
 
349
+ select_param = 8
350
  model_dict = {"model":select_model, "param":select_param}
351
 
352
+ learn_model = st.checkbox("Learn more about the model", key="tree")
353
  if learn_model:
354
+ st.markdown("""The following image showcases a decision tree which predicts whether a **bank should give out a loan** to a client. <br>
355
  The data used to train the model has each client's **age**, **salary** and **number of children**.""", unsafe_allow_html=True)
356
 
357
+ st.markdown("""To predict whether a client gets a loan, the client's data goes through each 'leaf' in the tree (leaves are the blue box question in the image below) and **gets assigned the class of the final leaf it fell into** (either Get loan or Don't get loan).
358
  For example, a client that is under 30 years old and has a lower salary than 2500$ will not be awarded a loan by the model.""", unsafe_allow_html=True)
359
 
360
  st.image("images/decisiontree.png", width=800)
 
370
  if select_model == "Random Forest 🏕️":
371
  st.info("""**About the model:** Random Forest models generate multiple decision tree models to make predictions.
372
  The main drawback of decision trees is that their predictions can be unstable, meaning that their output often changes.
373
+ Random Forest models combine the predictions of multiple decision trees to reduce this unstability and improve robustness.""")
374
 
375
+ select_param = 8
376
  model_dict = {"model":select_model, "param":select_param}
377
 
378
+ learn_model = st.checkbox("Learn more about the model", key="tree")
379
  if learn_model:
380
+ st.markdown("""Random Forest classifiers combine the results of multiple trees by applying **majority voting**, which means selecting the class that was most often predicted by trees as the final prediction.
381
+ In the following image, the random forest model built four decision trees, who each have made their own class prediction. <br>"""
382
  , unsafe_allow_html=True)
383
 
384
  st.markdown("""Class C was predicted twice, whereas Class B et D where only predicted once. <br>
 
408
 
409
 
410
  run_model = st.button("Run model", type="primary")
411
+ st.markdown(" ")
412
 
413
  if run_model:
414
  score, feature_imp, feature_names, labels = model_training(X, y, model_dict, _num_transformer=StandardScaler())
415
 
416
  if select_model in ["Decision Tree 🌳", "Random Forest 🏕️"]: # show explainability for decision tree, random firest
417
+ tab1, tab2 = st.tabs(["Results", "Explainability"])
418
 
419
  with tab1:
420
+ st.markdown("#### Results")
421
+ st.markdown("""The values below represent the model's accuracy for each possible class.
422
+ The lowest possible accuracy is 0 and the highest 100.""")
423
+ if select_data == "Diabetes prevention 👩‍⚕️":
424
+ st.warning("""**Note**: The Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
425
  This small number of patient data explains why the model's performance isn't optimal.
426
+ Additional data collection as well as hyperparameter tuning can be conducted to improve results.""")
427
 
428
  score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
429
+ fig = px.bar(score_df, x="label", y="accuracy", color="label", text_auto=True)
430
  st.plotly_chart(fig, use_container_width=True)
431
+
432
  st.warning("""**Note**: To improve the results of a model, practionners often conduct *hyperparameter tuning*.
433
  It consists of trying different combination of the model's parameters to maximise the accuracy score.
434
  Hyperparameter tuning wasn't conduct here in order to insure the app doesn't lag.""")
435
+
436
 
437
  with tab2:
438
+ st.markdown("#### Explainability")
439
+ st.markdown("""Variables with a high explainability score had the most impact on the model's predictions.
440
+ Variables with a low explainability score had a much smaller impact.""")
441
 
442
  df_feature_imp = pd.DataFrame({"variable":feature_names, "importance":feature_imp})
443
  df_feature_imp = df_feature_imp.groupby("variable").mean().reset_index()
 
449
 
450
  else: # only show results for knn
451
  st.markdown("#### Results")
452
+ st.markdown("""The values below represent the model's accuracy for each possible class.
453
+ The lowest possible accuracy is 0 and the highest 100.""")
454
 
455
  st.markdown("""The K-nearest-neighbor algorithm doesn't have a built-in solution to compute model explainability with `scikit-learn`.
456
+ You can use other python packages such as `SHAP` to compute explainability, which we didn't use here since they usually take a long time to output results.""")
457
 
458
+ if select_data == "Diabetes prevention 👩‍⚕️":
459
+ st.warning("""**Note**: The Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
460
  This small number of patient data explains why the model's performance isn't optimal.
461
+ Additional data collection as well as hyperparameter tuning can be conducted to improve results.""")
462
 
463
  score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
464
  fig = px.bar(score_df, x="label", y="accuracy", color="label", title="Accuracy results", text_auto=True)
 
475
 
476
 
477
 
478
+
pages/object_detection.py CHANGED
@@ -170,10 +170,10 @@ st.divider()
170
  st.markdown("# Fashion Object Detection 👗")
171
  # st.info("""This use case showcases the application of **Object detection** to detect clothing items/features on images. <br>
172
  # The images used were gathered from Dior's""")
173
- st.info("""**Object detection models** can very valuable for fashion retailers wishing to improve customer experience by providing, for example, **product recognition**, **visual search**
174
- and even **virtual try-ons**.
175
- In this use case, we are going to show an object detection model that as able to identify and locate different articles of clothings on fashipn show images.
176
- """)
177
 
178
  st.markdown(" ")
179
  st.markdown(" ")
@@ -194,8 +194,8 @@ st.markdown(" ")
194
 
195
 
196
  st.markdown("### About the model 📚")
197
- st.markdown("""The object detection model was trained specifically to **detect clothing items** on images. <br>
198
- It is able to detect <b>46</b> different types of clothing items.""", unsafe_allow_html=True)
199
 
200
  colors = ["#8ef", "#faa", "#afa", "#fea", "#8ef","#afa"]*7 + ["#8ef", "#faa", "#afa", "#fea"]
201
 
@@ -209,7 +209,7 @@ annotated_text([cats_annotated])
209
  # 'epaulette', 'sleeve', 'pocket', 'neckline', 'buckle', 'zipper', 'applique', 'bead', 'bow', 'flower', 'fringe', 'ribbon', 'rivet',
210
  # 'ruffle', 'sequin', 'tassel'""", unsafe_allow_html=True)
211
 
212
- st.markdown("Credits: https://huggingface.co/valentinafeve/yolos-fashionpedia")
213
  st.markdown("")
214
  st.markdown("")
215
 
@@ -294,6 +294,7 @@ dict_cats_final = {key:value for (key,value) in dict_cats.items() if value in se
294
  st.markdown("### Define a threshold for predictions 🔎")
295
  st.markdown("""In this section, you can select a threshold for the model's final predictions. <br>
296
  Objects that are given a lower score than the chosen threshold will be ignored in the final results""", unsafe_allow_html=True)
 
297
  st.info("""**Note**: Object detection models detect objects using bounding boxes as well as assign objects to specific classes.
298
  Each object is given a class based on a probability score computed by the model. A high probability signals that the model is confident in its prediction.
299
  On the contrary, a lower probability score signals a level of uncertainty.""")
 
170
  st.markdown("# Fashion Object Detection 👗")
171
  # st.info("""This use case showcases the application of **Object detection** to detect clothing items/features on images. <br>
172
  # The images used were gathered from Dior's""")
173
+ st.info("""**Object detection models** can be very valuable for fashion retailers wishing to improve customer experience. They can provide, for example, **product recognition**, **visual search**
174
+ and even **virtual try-ons**.""")
175
+
176
+ st.markdown("In this use case, we are going to show an object detection model that is able to identify and locate different articles of clothing on fashion show images.")
177
 
178
  st.markdown(" ")
179
  st.markdown(" ")
 
194
 
195
 
196
  st.markdown("### About the model 📚")
197
+ st.markdown("""The object detection model was trained to **detect specific clothing items** on images. <br>
198
+ Below is a list of the <b>46</b> different types of clothing items the model can identify and locate.""", unsafe_allow_html=True)
199
 
200
  colors = ["#8ef", "#faa", "#afa", "#fea", "#8ef","#afa"]*7 + ["#8ef", "#faa", "#afa", "#fea"]
201
 
 
209
  # 'epaulette', 'sleeve', 'pocket', 'neckline', 'buckle', 'zipper', 'applique', 'bead', 'bow', 'flower', 'fringe', 'ribbon', 'rivet',
210
  # 'ruffle', 'sequin', 'tassel'""", unsafe_allow_html=True)
211
 
212
+ st.markdown("Credits for the model: https://huggingface.co/valentinafeve/yolos-fashionpedia")
213
  st.markdown("")
214
  st.markdown("")
215
 
 
294
  st.markdown("### Define a threshold for predictions 🔎")
295
  st.markdown("""In this section, you can select a threshold for the model's final predictions. <br>
296
  Objects that are given a lower score than the chosen threshold will be ignored in the final results""", unsafe_allow_html=True)
297
+
298
  st.info("""**Note**: Object detection models detect objects using bounding boxes as well as assign objects to specific classes.
299
  Each object is given a class based on a probability score computed by the model. A high probability signals that the model is confident in its prediction.
300
  On the contrary, a lower probability score signals a level of uncertainty.""")
pages/recommendation_system.py CHANGED
@@ -26,7 +26,7 @@ st.markdown("### What is a Recommendation System ?")
26
  st.info("""**Recommendation systems** are algorithms built to **suggest** or **recommend** **products** to consumers.
27
  They are very common in social media platforms such as TikTok, Youtube or Instagram or e-commerce websites as they help improve and personalize a consumer's experience.""")
28
 
29
- st.markdown("""There are two methods to build recommendation systems:
30
  - **Content-based filtering**: Recommendations are made based on the user's own preferences
31
  - **Collaborative filtering**: Recommendations are made based on the preferences and behavior of similar users""", unsafe_allow_html=True)
32
 
 
26
  st.info("""**Recommendation systems** are algorithms built to **suggest** or **recommend** **products** to consumers.
27
  They are very common in social media platforms such as TikTok, Youtube or Instagram or e-commerce websites as they help improve and personalize a consumer's experience.""")
28
 
29
+ st.markdown("""There are two main types of recommendation systems:
30
  - **Content-based filtering**: Recommendations are made based on the user's own preferences
31
  - **Collaborative filtering**: Recommendations are made based on the preferences and behavior of similar users""", unsafe_allow_html=True)
32
 
pages/supervised_unsupervised_page.py CHANGED
@@ -26,8 +26,9 @@ st.set_page_config(layout="wide")
26
  #st.image("images/ML_header.jpg", use_column_width=True)
27
  st.markdown("# Supervised vs Unsupervised Learning 🔍")
28
 
29
- st.info("""There are two main types of models in the field of Data Science, **Supervised** and **Unsupervised learning** models.
30
- Being able to distinguish which type of model fits your data is an essential step in building any AI project.""")
 
31
 
32
  st.markdown(" ")
33
  #st.markdown("## What are the differences between both ?")
@@ -38,7 +39,7 @@ with col1:
38
  st.markdown("## Supervised Learning")
39
  st.markdown("""Supervised learning models are trained by learning from **labeled data**. <br>
40
  Labeled data provides to the model the desired output, which it will then use to learn relevant patterns and make predictions.
41
- - A model is first **trained** to make predictions using labeled data.
42
  - The trained model can then be used to **predict values** for new data.
43
  """, unsafe_allow_html=True)
44
  st.markdown(" ")
@@ -57,7 +58,7 @@ with col2:
57
 
58
  st.markdown(" ")
59
 
60
- learning_type = st.selectbox("**Select a type of model**",
61
  ["Supervised Learning",
62
  "Unsupervised Learning"])
63
 
@@ -91,8 +92,11 @@ if learning_type == "Supervised Learning":
91
  ## Description of the use case
92
  st.divider()
93
  st.markdown("# Credit score classification 💯")
94
- st.info("""**Classification** is a type of supervised learning where the goal is to categorize input data into predefined classes or categories.
95
- In this case, we will build a **credit score classification** model that predicts if a client will have a **'Bad'**, **'Standard'** or **'Good'** credit score.""")
 
 
 
96
  st.markdown(" ")
97
 
98
  _, col, _ = st.columns([0.25,0.5,0.25])
@@ -101,7 +105,7 @@ if learning_type == "Supervised Learning":
101
 
102
  ## Learn about the data
103
  st.markdown("#### About the data 📋")
104
- st.markdown("""To train the credit classification model, you were provided a **labeled** database with the bank and credit-related information of around 7600 clients. <br>
105
  This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Credit_Score** variable.""",
106
  unsafe_allow_html=True)
107
 
@@ -350,9 +354,12 @@ if learning_type == "Supervised Learning":
350
  ## Description of the use case
351
  st.divider()
352
  st.markdown("# Customer churn prediction ❌")
353
- st.info(""" Classification is a type of supervised learning model whose goal is to categorize input data into predefined classes or categories.
354
- In this example, we will build a **customer churn classification model** that can predict whether a customer is likely to leave a company's service in the future using historical data.
355
- """)
 
 
 
356
 
357
  st.markdown(" ")
358
 
@@ -367,8 +374,8 @@ if learning_type == "Supervised Learning":
367
 
368
  ## Learn about the data
369
  st.markdown("#### About the data 📋")
370
- st.markdown("""To train the customer churn classification model, you were provided a **labeled** database with around 7000 clients of a telecommunications company. <br>
371
- The data contains information on which services the customer has signed for, information on his account as well as whether the customer churned or not (our label here).""",
372
  unsafe_allow_html=True)
373
  # st.markdown("This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Churn** variable.")
374
  st.info("**Note**: The variables that had two possible values (Yes or No) were transformed into binary variables (0 or 1) with 0 being 'No' and 1 being 'Yes'.")
@@ -660,7 +667,7 @@ def markdown_general_info(df):
660
 
661
  if learning_type == "Unsupervised Learning":
662
  usl_usecase = st.selectbox("**Choose a use case**",
663
- ["Customer segmentation 🧑‍🤝‍🧑"])
664
 
665
 
666
  #################################### CUSTOMER SEGMENTATION ##################################
@@ -668,16 +675,16 @@ if learning_type == "Unsupervised Learning":
668
  path_clustering = r"data/clustering"
669
  path_clustering_results = r"data/clustering/results"
670
 
671
- if usl_usecase == "Customer segmentation 🧑‍🤝‍🧑":
672
 
673
  # st.divider()
674
  st.divider()
675
- st.markdown("# Customer Segmentation 🧑‍🤝‍🧑")
676
 
677
- st.info("""**Unsupervised learning** models are valulable tools for cases where you want your model to discover patterns by itself, without having to give it examples to learn from (especially if you don't have labeled data).
678
- In this use case, we will show how they can be useful for **Customer Segmentation** to detect unknown groups of clients in a company's customer base.
679
- Using this previously unknown segmentation, companies can then create more targeted add campaigns based on their consumer's behavior and preferences.
680
- """)
681
  st.markdown(" ")
682
 
683
  ## Show image
@@ -726,13 +733,14 @@ if learning_type == "Unsupervised Learning":
726
  st.info("""**Clustering** is a type of unsupervised learning method that learns how to group similar data points together into "clusters", without needing supervision.
727
  In our case, a data point represents a customer that will be assigned to an unknown group.""")
728
 
729
- st.markdown("""
730
- - The clustering algorithm used in this use case allows a specific number of groups to be identified, which isn't the case for all clustering models.
731
- - The number of clusters chosen by the user can have a strong impact on the quality of the segmentation. Try to run the model multiple times with different number of clusters and see which number leads to groups with more distinct customer behaviors/preferences.""")
732
  st.markdown(" ")
733
  st.markdown("Here is an example of grouped data using a clustering model.")
734
  st.image("images/clustering.webp")
735
 
 
736
 
737
  nb_groups = st.selectbox("Choose a number of customer groups to identify", np.arange(2,6))
738
  df_results = load_data_pickle(path_clustering_results, f"results_{nb_groups}_clusters.pkl")
 
26
  #st.image("images/ML_header.jpg", use_column_width=True)
27
  st.markdown("# Supervised vs Unsupervised Learning 🔍")
28
 
29
+ st.info("""Data Science models are often split into two categories: **Supervised** and **Unsupervised Learning**.
30
+ The goal of this page is to present these two kinds of Data Science models, as well as give you multiple use cases to try them with.
31
+ Note that other kinds of AI models exist such as Reinforcement Learning or Federated Learning, which we won't cover in this app.""")
32
 
33
  st.markdown(" ")
34
  #st.markdown("## What are the differences between both ?")
 
39
  st.markdown("## Supervised Learning")
40
  st.markdown("""Supervised learning models are trained by learning from **labeled data**. <br>
41
  Labeled data provides to the model the desired output, which it will then use to learn relevant patterns and make predictions.
42
+ - A model is first **trained** to make predictions using labeled data, which contains the desired output.
43
  - The trained model can then be used to **predict values** for new data.
44
  """, unsafe_allow_html=True)
45
  st.markdown(" ")
 
58
 
59
  st.markdown(" ")
60
 
61
+ learning_type = st.selectbox("**Select an AI task**",
62
  ["Supervised Learning",
63
  "Unsupervised Learning"])
64
 
 
92
  ## Description of the use case
93
  st.divider()
94
  st.markdown("# Credit score classification 💯")
95
+ st.info("""**Classification models** are supervised learning models whose goal is to categorize data into predefined categories.
96
+ As opposed to unsupervised learning models, these categories are known beforehand.
97
+ Other types of supervised learning models include Regression models, which learn how to predict numerical values, instead of a set number of categories.""")
98
+
99
+ st.markdown("In this use case, we will build a **credit score classification model** which predicts whether a client has a 'Bad', 'Standard', or 'Good' credit score.")
100
  st.markdown(" ")
101
 
102
  _, col, _ = st.columns([0.25,0.5,0.25])
 
105
 
106
  ## Learn about the data
107
  st.markdown("#### About the data 📋")
108
+ st.markdown("""To train the credit classification model, you were provided a **labeled** database with 7600 clients and containing bank and credit-related client information. <br>
109
  This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Credit_Score** variable.""",
110
  unsafe_allow_html=True)
111
 
 
354
  ## Description of the use case
355
  st.divider()
356
  st.markdown("# Customer churn prediction ❌")
357
+
358
+ st.info("""**Classification models** are supervised learning models whose goal is to categorize data into predefined categories.
359
+ As opposed to unsupervised learning models, these categories are known beforehand.
360
+ Other types of supervised learning models include Regression models, which learn how to predict numerical values, instead of a set number of categories.""")
361
+
362
+ st.markdown("For this use case, we will build a **customer churn classification model** that can predict whether a person will stop being a customer using historical data.")
363
 
364
  st.markdown(" ")
365
 
 
374
 
375
  ## Learn about the data
376
  st.markdown("#### About the data 📋")
377
+ st.markdown("""To train the customer churn model, you were provided a **labeled** database with around 7000 clients of a telecommunications company. <br>
378
+ The data contains information on which services the customer has signed for, account information as well as whether the customer churned or not (our label here).""",
379
  unsafe_allow_html=True)
380
  # st.markdown("This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Churn** variable.")
381
  st.info("**Note**: The variables that had two possible values (Yes or No) were transformed into binary variables (0 or 1) with 0 being 'No' and 1 being 'Yes'.")
 
667
 
668
  if learning_type == "Unsupervised Learning":
669
  usl_usecase = st.selectbox("**Choose a use case**",
670
+ ["Customer segmentation (clustering) 🧑‍🤝‍🧑"])
671
 
672
 
673
  #################################### CUSTOMER SEGMENTATION ##################################
 
675
  path_clustering = r"data/clustering"
676
  path_clustering_results = r"data/clustering/results"
677
 
678
+ if usl_usecase == "Customer segmentation (clustering) 🧑‍🤝‍🧑":
679
 
680
  # st.divider()
681
  st.divider()
682
+ st.markdown("# Customer Segmentation (clustering) 🧑‍🤝‍🧑")
683
 
684
+ st.markdown("""In this use case, we will use a clustering model, a type of Unsupervised Learning model, to perform **Customer Segmentation**. <br>
685
+ Our model will allow similar groups of clients to be identified within a company's consumer database based on consumer habits and characteristics.
686
+ """, unsafe_allow_html=True)
687
+
688
  st.markdown(" ")
689
 
690
  ## Show image
 
733
  st.info("""**Clustering** is a type of unsupervised learning method that learns how to group similar data points together into "clusters", without needing supervision.
734
  In our case, a data point represents a customer that will be assigned to an unknown group.""")
735
 
736
+ # st.markdown("""
737
+ # - The clustering algorithm used in this use case allows a specific number of groups to be identified, which isn't the case for all clustering models.""")
738
+
739
  st.markdown(" ")
740
  st.markdown("Here is an example of grouped data using a clustering model.")
741
  st.image("images/clustering.webp")
742
 
743
+ st.warning("**Note**: The number of clusters chosen by the user can have a strong impact on the quality of the segmentation. Try to run the model multiple times with different numbers of clusters and see which number leads to groups with more distinct customer behaviors/preferences.")
744
 
745
  nb_groups = st.selectbox("Choose a number of customer groups to identify", np.arange(2,6))
746
  df_results = load_data_pickle(path_clustering_results, f"results_{nb_groups}_clusters.pkl")
pages/timeseries_analysis.py CHANGED
@@ -35,7 +35,7 @@ def forecast_prophet(train, test, col=None):
35
  st.markdown("# Time Series Forecasting 📈")
36
 
37
  st.markdown("### What is Time Series Forecasting ?")
38
- st.info("""Time series forecasting models are AI models built to make accurate predictions about future values using historical data.
39
  These types of models take into account temporal patterns, such as **trends** (long-term movements), **seasonality** (repeating patterns at fixed intervals), and **cyclic patterns** (repeating patterns not necessarily at fixed intervals)""")
40
  #unsafe_allow_html=True)
41
 
@@ -77,8 +77,8 @@ st.divider()
77
  st.markdown("# Power Consumption Forecasting ⚡")
78
 
79
  #st.markdown(" ")
80
- st.info("""In this use case, a time series forecasting model is used to predict the **energy consumption** (or **Global Active Power**) of a household using historical data.
81
- A forecasting model can be a valuable tool to optimize resource planning and avoid overloads during peak demand periods.""")
82
 
83
  st.markdown(" ")
84
 
 
35
  st.markdown("# Time Series Forecasting 📈")
36
 
37
  st.markdown("### What is Time Series Forecasting ?")
38
+ st.info("""Time series forecasting models are AI models built to make predictions about future values using historical data.
39
  These types of models take into account temporal patterns, such as **trends** (long-term movements), **seasonality** (repeating patterns at fixed intervals), and **cyclic patterns** (repeating patterns not necessarily at fixed intervals)""")
40
  #unsafe_allow_html=True)
41
 
 
77
  st.markdown("# Power Consumption Forecasting ⚡")
78
 
79
  #st.markdown(" ")
80
+ st.info("""In this use case, a time series forecasting model learns how to accurately predict the **energy consumption** (or global active power in the dataset) of a household using historical data.
81
+ A forecasting model can be a valuable tool for energy consumption analysis as it can help **optimize resource planning** and **avoid overloads** during peak demand periods.""")
82
 
83
  st.markdown(" ")
84
 
pages/topic_modeling.py CHANGED
@@ -41,7 +41,7 @@ st.markdown(" ")
41
  st.divider()
42
 
43
  st.markdown("# Topic modeling on product descriptions 🛍️")
44
- st.info("""In this use case, we will use a topic model to categorize around 20 000 e-commerce products using text descriptions and identify
45
  the main types of products sold.""")
46
 
47
  _, col, _ = st.columns([0.2,0.6,0.2])
 
41
  st.divider()
42
 
43
  st.markdown("# Topic modeling on product descriptions 🛍️")
44
+ st.markdown("""In this use case, we will use a **topic model** to categorize around **20 000 e-commerce products** as well as identify
45
  the main types of products sold.""")
46
 
47
  _, col, _ = st.columns([0.2,0.6,0.2])