Spaces:

beihai
/

LightGBM-parameter-tuning

Runtime error

App Files Files Community

tjxj commited on Apr 4, 2022

Commit

7fb7db7

•

1 Parent(s): f83439d

1.0

Browse files

Files changed (7) hide show

.streamlit/config.toml +2 -0
LICENSE +201 -0
LightGBM 调参.md +402 -0
app.py +135 -0
definitions.py +24 -0
git.sh +3 -0
requirements.txt +6 -0

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [deprecation]
2	+ showPyplotGlobalUse = false

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

LightGBM 调参.md ADDED Viewed

	@@ -0,0 +1,402 @@

+**Step1. 学习率和估计器及其数目**
+不管怎么样，我们先把学习率定为一个较高的值，这里取 `learning_rate = 0.1`，其次确定估计器`boosting/boost/boosting_type`的类型，不过默认都会选`gbdt`。
+为了确定估计器的数目，也就是boosting迭代的次数，也可以说是残差树的数目，参数名为`n_estimators/num_iterations/num_round/num_boost_round`。我们可以先将该参数设成一个较大的数，然后在cv结果中查看最优的迭代次数，具体如代码。
+在这之前，我们必须给其他重要的参数一个初始值。初始值的意义不大，只是为了方便确定其他参数。下面先给定一下初始值：
+以下参数根据具体项目要求定：
+```
+'boosting_type'/'boosting': 'gbdt'
+'objective': 'regression'
+'metric': 'rmse'
+```
+以下参数我选择的初始值，你可以根据自己的情况来选择：
+```
+'max_depth': 6     ###   根据问题来定咯，由于我的数据集不是很大，所以选择了一个适中的值，其实4-10都无所谓。
+'num_leaves': 50  ###   由于lightGBM是leaf-wise生长，官方说法是要小于2^max_depth
+'subsample'/'bagging_fraction':0.8           ###  数据采样
+'colsample_bytree'/'feature_fraction': 0.8  ###  特征采样
+```
+下面我是用LightGBM的cv函数进行演示：
+```
+params = {
+    'boosting_type': 'gbdt',
+    'objective': 'regression',
+    'learning_rate': 0.1,
+    'num_leaves': 50,
+    'max_depth': 6,
+    'subsample': 0.8,
+    'colsample_bytree': 0.8,
+    }
+data_train = lgb.Dataset(df_train, y_train, silent=True)
+cv_results = lgb.cv(
+    params, data_train, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, metrics='rmse',
+    early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)
+print('best n_estimators:', len(cv_results['rmse-mean']))
+print('best cv score:', cv_results['rmse-mean'][-1])
+[50]    cv_agg's rmse: 1.38497 + 0.0202823
+best n_estimators: 43
+best cv score: 1.3838664241
+```
+由于我的数据集不是很大，所以在学习率为0.1时，最优的迭代次数只有43。那么现在，我们就代入(0.1, 43)进入其他参数的tuning。但是还是建议，在硬件条件允许的条件下，学习率还是越小越好。
+**Step2. max_depth 和 num_leaves**
+这是提高精确度的最重要的参数。
+`max_depth` ：设置树深度，深度越大可能过拟合
+`num_leaves`：因为 LightGBM 使用的是 leaf-wise 的算法，因此在调节树的复杂程度时，使用的是 num_leaves 而不是 max_depth。大致换算关系：num_leaves = 2^(max_depth)，但是它的值的设置应该小于 2^(max_depth)，否则可能会导致过拟合。
+我们也可以同时调节这两个参数，对于这两个参数调优，我们先粗调，再细调：
+这里我们引入`sklearn`里的`GridSearchCV()`函数进行搜索。不知道怎的，这个函数特别耗内存，特别耗时间，特别耗精力。
+```
+from sklearn.model_selection import GridSearchCV
+### 我们可以创建lgb的sklearn模型，使用上面选择的(学习率，评估器数目)
+model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
+                              learning_rate=0.1, n_estimators=43, max_depth=6,
+                              metric='rmse', bagging_fraction = 0.8,feature_fraction = 0.8)
+params_test1={
+    'max_depth': range(3,8,2),
+    'num_leaves':range(50, 170, 30)
+}
+gsearch1 = GridSearchCV(estimator=model_lgb, param_grid=params_test1, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
+gsearch1.fit(df_train, y_train)
+gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
+Fitting 5 folds for each of 12 candidates, totalling 60 fits
+[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.0min
+[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:  3.1min finished
+([mean: -1.88629, std: 0.13750, params: {'max_depth': 3, 'num_leaves': 50},
+  mean: -1.88629, std: 0.13750, params: {'max_depth': 3, 'num_leaves': 80},
+  mean: -1.88629, std: 0.13750, params: {'max_depth': 3, 'num_leaves': 110},
+  mean: -1.88629, std: 0.13750, params: {'max_depth': 3, 'num_leaves': 140},
+  mean: -1.86917, std: 0.12590, params: {'max_depth': 5, 'num_leaves': 50},
+  mean: -1.86917, std: 0.12590, params: {'max_depth': 5, 'num_leaves': 80},
+  mean: -1.86917, std: 0.12590, params: {'max_depth': 5, 'num_leaves': 110},
+  mean: -1.86917, std: 0.12590, params: {'max_depth': 5, 'num_leaves': 140},
+  mean: -1.89254, std: 0.10904, params: {'max_depth': 7, 'num_leaves': 50},
+  mean: -1.86024, std: 0.11364, params: {'max_depth': 7, 'num_leaves': 80},
+  mean: -1.86024, std: 0.11364, params: {'max_depth': 7, 'num_leaves': 110},
+  mean: -1.86024, std: 0.11364, params: {'max_depth': 7, 'num_leaves': 140}],
+ {'max_depth': 7, 'num_leaves': 80},
+ -1.8602436718814157)
+```
+这里，我们运行了12个参数组合，得到的最优解是在max_depth为7，num_leaves为80的情况下，分数为-1.860。
+这里必须说一下，sklearn模型评估里的scoring参数都是采用的**higher return values are better than lower return values（较高的返回值优于较低的返回值）**。
+但是，我采用的metric是均方根误差(rmse)，越低越好，所以sklearn提供了`neg_mean_squared_error`这个scoring选项，也就是返回均方误差(mse)的负数，这样就变成负数越大越好了。
+所以，可以看到，最优解的分数为-1.860，转化为均方根误差为np.sqrt(-(-1.860)) = 1.3639，明显比step1的分数要好很多。
+至此，我们将我们这步得到的最优解代入第三步。其实，我这里只进行了粗调，如果要得到更好的效果，可以将max_depth在7附近多取几个值，num_leaves在80附近多取几个值。千万不要怕麻烦，虽然这确实很麻烦。
+```
+params_test2={
+    'max_depth': [6,7,8],
+    'num_leaves':[68,74,80,86,92]
+}
+gsearch2 = GridSearchCV(estimator=model_lgb, param_grid=params_test2, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
+gsearch2.fit(df_train, y_train)
+gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
+Fitting 5 folds for each of 15 candidates, totalling 75 fits
+[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.8min
+[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:  5.1min finished
+([mean: -1.87506, std: 0.11369, params: {'max_depth': 6, 'num_leaves': 68},
+  mean: -1.87506, std: 0.11369, params: {'max_depth': 6, 'num_leaves': 74},
+  mean: -1.87506, std: 0.11369, params: {'max_depth': 6, 'num_leaves': 80},
+  mean: -1.87506, std: 0.11369, params: {'max_depth': 6, 'num_leaves': 86},
+  mean: -1.87506, std: 0.11369, params: {'max_depth': 6, 'num_leaves': 92},
+  mean: -1.86024, std: 0.11364, params: {'max_depth': 7, 'num_leaves': 68},
+  mean: -1.86024, std: 0.11364, params: {'max_depth': 7, 'num_leaves': 74},
+  mean: -1.86024, std: 0.11364, params: {'max_depth': 7, 'num_leaves': 80},
+  mean: -1.86024, std: 0.11364, params: {'max_depth': 7, 'num_leaves': 86},
+  mean: -1.86024, std: 0.11364, params: {'max_depth': 7, 'num_leaves': 92},
+  mean: -1.88197, std: 0.11295, params: {'max_depth': 8, 'num_leaves': 68},
+  mean: -1.89117, std: 0.12686, params: {'max_depth': 8, 'num_leaves': 74},
+  mean: -1.86390, std: 0.12259, params: {'max_depth': 8, 'num_leaves': 80},
+  mean: -1.86733, std: 0.12159, params: {'max_depth': 8, 'num_leaves': 86},
+  mean: -1.86665, std: 0.12174, params: {'max_depth': 8, 'num_leaves': 92}],
+ {'max_depth': 7, 'num_leaves': 68},
+ -1.8602436718814157)
+```
+可见最大深度7是没问题的，但是看细节的话，发现在最大深度为7的情况下，叶结点的数量对分数并没有影响。
+**Step3: min_data_in_leaf 和 min_sum_hessian_in_leaf**
+说到这里，就该降低过拟合了。
+`min_data_in_leaf` 是一个很重要的参数, 也叫min_child_samples，它的值取决于训练数据的样本个数和num_leaves. 将其设置的较大可以避免生成一个过深的树, 但有可能导致欠拟合。
+`min_sum_hessian_in_leaf`：也叫min_child_weight，使一个结点分裂的最小海森值之和，真拗口（Minimum sum of hessians in one leaf to allow a split. Higher values potentially decrease overfitting）
+我们采用跟上面相同的方法进行：
+```
+params_test3={
+    'min_child_samples': [18, 19, 20, 21, 22],
+    'min_child_weight':[0.001, 0.002]
+}
+model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=80,
+                              learning_rate=0.1, n_estimators=43, max_depth=7,
+                              metric='rmse', bagging_fraction = 0.8, feature_fraction = 0.8)
+gsearch3 = GridSearchCV(estimator=model_lgb, param_grid=params_test3, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
+gsearch3.fit(df_train, y_train)
+gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
+Fitting 5 folds for each of 10 candidates, totalling 50 fits
+[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.9min
+[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  3.3min finished
+([mean: -1.88057, std: 0.13948, params: {'min_child_samples': 18, 'min_child_weight': 0.001},
+  mean: -1.88057, std: 0.13948, params: {'min_child_samples': 18, 'min_child_weight': 0.002},
+  mean: -1.88365, std: 0.13650, params: {'min_child_samples': 19, 'min_child_weight': 0.001},
+  mean: -1.88365, std: 0.13650, params: {'min_child_samples': 19, 'min_child_weight': 0.002},
+  mean: -1.86024, std: 0.11364, params: {'min_child_samples': 20, 'min_child_weight': 0.001},
+  mean: -1.86024, std: 0.11364, params: {'min_child_samples': 20, 'min_child_weight': 0.002},
+  mean: -1.86980, std: 0.14251, params: {'min_child_samples': 21, 'min_child_weight': 0.001},
+  mean: -1.86980, std: 0.14251, params: {'min_child_samples': 21, 'min_child_weight': 0.002},
+  mean: -1.86750, std: 0.13898, params: {'min_child_samples': 22, 'min_child_weight': 0.001},
+  mean: -1.86750, std: 0.13898, params: {'min_child_samples': 22, 'min_child_weight': 0.002}],
+ {'min_child_samples': 20, 'min_child_weight': 0.001},
+ -1.8602436718814157)
+```
+这是我经过粗调后细调的结果，可以看到，min_data_in_leaf的最优值为20，而min_sum_hessian_in_leaf对最后的值几乎没有影响。且这里调参之后，最后的值没有进行优化，说明之前的默认值即为20，0.001。
+**Step4: feature_fraction 和 bagging_fraction**
+这两个参数都是为了降低过拟合的。
+feature_fraction参数来进行特征的子抽样。这个参数可以用来防止过拟合及提高训练速度。
+bagging_fraction+bagging_freq参数必须同时设置，bagging_fraction相当于subsample样本采样，可以使bagging更快的运行，同时也可以降低过拟合。bagging_freq默认0，表示bagging的频率，0意味着没有使用bagging，k意味着每k轮迭代进行一次bagging。
+不同的参数，同样的方法。
+```
+params_test4={
+    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
+    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]
+}
+model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=80,
+                              learning_rate=0.1, n_estimators=43, max_depth=7,
+                              metric='rmse', bagging_freq = 5,  min_child_samples=20)
+gsearch4 = GridSearchCV(estimator=model_lgb, param_grid=params_test4, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
+gsearch4.fit(df_train, y_train)
+gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_
+Fitting 5 folds for each of 25 candidates, totalling 125 fits
+[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.6min
+[Parallel(n_jobs=4)]: Done 125 out of 125 | elapsed:  7.1min finished
+([mean: -1.90447, std: 0.15841, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.5},
+  mean: -1.90846, std: 0.13925, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.6},
+  mean: -1.91695, std: 0.14121, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.7},
+  mean: -1.90115, std: 0.12625, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.8},
+  mean: -1.92586, std: 0.15220, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.9},
+  mean: -1.88031, std: 0.17157, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.5},
+  mean: -1.89513, std: 0.13718, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.6},
+  mean: -1.88845, std: 0.13864, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.7},
+  mean: -1.89297, std: 0.12374, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.8},
+  mean: -1.89432, std: 0.14353, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.9},
+  mean: -1.88088, std: 0.14247, params: {'bagging_fraction': 0.8, 'feature_fraction': 0.5},
+  mean: -1.90080, std: 0.13174, params: {'bagging_fraction': 0.8, 'feature_fraction': 0.6},
+  mean: -1.88364, std: 0.14732, params: {'bagging_fraction': 0.8, 'feature_fraction': 0.7},
+  mean: -1.88987, std: 0.13344, params: {'bagging_fraction': 0.8, 'feature_fraction': 0.8},
+  mean: -1.87752, std: 0.14802, params: {'bagging_fraction': 0.8, 'feature_fraction': 0.9},
+  mean: -1.88348, std: 0.13925, params: {'bagging_fraction': 0.9, 'feature_fraction': 0.5},
+  mean: -1.87472, std: 0.13301, params: {'bagging_fraction': 0.9, 'feature_fraction': 0.6},
+  mean: -1.88656, std: 0.12241, params: {'bagging_fraction': 0.9, 'feature_fraction': 0.7},
+  mean: -1.89029, std: 0.10776, params: {'bagging_fraction': 0.9, 'feature_fraction': 0.8},
+  mean: -1.88719, std: 0.11915, params: {'bagging_fraction': 0.9, 'feature_fraction': 0.9},
+  mean: -1.86170, std: 0.12544, params: {'bagging_fraction': 1.0, 'feature_fraction': 0.5},
+  mean: -1.87334, std: 0.13099, params: {'bagging_fraction': 1.0, 'feature_fraction': 0.6},
+  mean: -1.85412, std: 0.12698, params: {'bagging_fraction': 1.0, 'feature_fraction': 0.7},
+  mean: -1.86024, std: 0.11364, params: {'bagging_fraction': 1.0, 'feature_fraction': 0.8},
+  mean: -1.87266, std: 0.12271, params: {'bagging_fraction': 1.0, 'feature_fraction': 0.9}],
+ {'bagging_fraction': 1.0, 'feature_fraction': 0.7},
+ -1.8541224387666373)
+```
+从这里可以看出来，bagging_fraction和feature_fraction的理想值分别是1.0和0.7，一个很重要原因就是，我的样本数量比较小(4000+)，但是特征数量很多(1000+)。所以，这里我们取更小的步长，对feature_fraction进行更细致的取值。
+```
+params_test5={
+    'feature_fraction': [0.62, 0.65, 0.68, 0.7, 0.72, 0.75, 0.78 ]
+}
+model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=80,
+                              learning_rate=0.1, n_estimators=43, max_depth=7,
+                              metric='rmse',  min_child_samples=20)
+gsearch5 = GridSearchCV(estimator=model_lgb, param_grid=params_test5, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
+gsearch5.fit(df_train, y_train)
+gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_
+Fitting 5 folds for each of 7 candidates, totalling 35 fits
+[Parallel(n_jobs=4)]: Done  35 out of  35 | elapsed:  2.3min finished
+([mean: -1.86696, std: 0.12658, params: {'feature_fraction': 0.62},
+  mean: -1.88337, std: 0.13215, params: {'feature_fraction': 0.65},
+  mean: -1.87282, std: 0.13193, params: {'feature_fraction': 0.68},
+  mean: -1.85412, std: 0.12698, params: {'feature_fraction': 0.7},
+  mean: -1.88235, std: 0.12682, params: {'feature_fraction': 0.72},
+  mean: -1.86329, std: 0.12757, params: {'feature_fraction': 0.75},
+  mean: -1.87943, std: 0.12107, params: {'feature_fraction': 0.78}],
+ {'feature_fraction': 0.7},
+ -1.8541224387666373)
+```
+好吧，feature_fraction就是0.7了
+**Step5: 正则化参数**
+正则化参数lambda_l1(reg_alpha), lambda_l2(reg_lambda)，毫无疑问，是降低过拟合的，两者分别对应l1正则化和l2正则化。我们也来尝试一下使用这两个参数。
+```
+params_test6={
+    'reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
+    'reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]
+}
+model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=80,
+                              learning_rate=0.1, n_estimators=43, max_depth=7,
+                              metric='rmse',  min_child_samples=20, feature_fraction=0.7)
+gsearch6 = GridSearchCV(estimator=model_lgb, param_grid=params_test6, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
+gsearch6.fit(df_train, y_train)
+gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_
+Fitting 5 folds for each of 49 candidates, totalling 245 fits
+[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.8min
+[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 10.6min
+[Parallel(n_jobs=4)]: Done 245 out of 245 | elapsed: 13.3min finished
+([mean: -1.85412, std: 0.12698, params: {'reg_alpha': 0, 'reg_lambda': 0},
+  mean: -1.85990, std: 0.13296, params: {'reg_alpha': 0, 'reg_lambda': 0.001},
+  mean: -1.86367, std: 0.13634, params: {'reg_alpha': 0, 'reg_lambda': 0.01},
+  mean: -1.86787, std: 0.13881, params: {'reg_alpha': 0, 'reg_lambda': 0.03},
+  mean: -1.87099, std: 0.12476, params: {'reg_alpha': 0, 'reg_lambda': 0.08},
+  mean: -1.87670, std: 0.11849, params: {'reg_alpha': 0, 'reg_lambda': 0.3},
+  mean: -1.88278, std: 0.13064, params: {'reg_alpha': 0, 'reg_lambda': 0.5},
+  mean: -1.86190, std: 0.13613, params: {'reg_alpha': 0.001, 'reg_lambda': 0},
+  mean: -1.86190, std: 0.13613, params: {'reg_alpha': 0.001, 'reg_lambda': 0.001},
+  mean: -1.86515, std: 0.14116, params: {'reg_alpha': 0.001, 'reg_lambda': 0.01},
+  mean: -1.86908, std: 0.13668, params: {'reg_alpha': 0.001, 'reg_lambda': 0.03},
+  mean: -1.86852, std: 0.12289, params: {'reg_alpha': 0.001, 'reg_lambda': 0.08},
+  mean: -1.88076, std: 0.11710, params: {'reg_alpha': 0.001, 'reg_lambda': 0.3},
+  mean: -1.88278, std: 0.13064, params: {'reg_alpha': 0.001, 'reg_lambda': 0.5},
+  mean: -1.87480, std: 0.13889, params: {'reg_alpha': 0.01, 'reg_lambda': 0},
+  mean: -1.87284, std: 0.14138, params: {'reg_alpha': 0.01, 'reg_lambda': 0.001},
+  mean: -1.86030, std: 0.13332, params: {'reg_alpha': 0.01, 'reg_lambda': 0.01},
+  mean: -1.86695, std: 0.12587, params: {'reg_alpha': 0.01, 'reg_lambda': 0.03},
+  mean: -1.87415, std: 0.13100, params: {'reg_alpha': 0.01, 'reg_lambda': 0.08},
+  mean: -1.88543, std: 0.13195, params: {'reg_alpha': 0.01, 'reg_lambda': 0.3},
+  mean: -1.88076, std: 0.13502, params: {'reg_alpha': 0.01, 'reg_lambda': 0.5},
+  mean: -1.87729, std: 0.12533, params: {'reg_alpha': 0.03, 'reg_lambda': 0},
+  mean: -1.87435, std: 0.12034, params: {'reg_alpha': 0.03, 'reg_lambda': 0.001},
+  mean: -1.87513, std: 0.12579, params: {'reg_alpha': 0.03, 'reg_lambda': 0.01},
+  mean: -1.88116, std: 0.12218, params: {'reg_alpha': 0.03, 'reg_lambda': 0.03},
+  mean: -1.88052, std: 0.13585, params: {'reg_alpha': 0.03, 'reg_lambda': 0.08},
+  mean: -1.87565, std: 0.12200, params: {'reg_alpha': 0.03, 'reg_lambda': 0.3},
+  mean: -1.87935, std: 0.13817, params: {'reg_alpha': 0.03, 'reg_lambda': 0.5},
+  mean: -1.87774, std: 0.12477, params: {'reg_alpha': 0.08, 'reg_lambda': 0},
+  mean: -1.87774, std: 0.12477, params: {'reg_alpha': 0.08, 'reg_lambda': 0.001},
+  mean: -1.87911, std: 0.12027, params: {'reg_alpha': 0.08, 'reg_lambda': 0.01},
+  mean: -1.86978, std: 0.12478, params: {'reg_alpha': 0.08, 'reg_lambda': 0.03},
+  mean: -1.87217, std: 0.12159, params: {'reg_alpha': 0.08, 'reg_lambda': 0.08},
+  mean: -1.87573, std: 0.14137, params: {'reg_alpha': 0.08, 'reg_lambda': 0.3},
+  mean: -1.85969, std: 0.13109, params: {'reg_alpha': 0.08, 'reg_lambda': 0.5},
+  mean: -1.87632, std: 0.12398, params: {'reg_alpha': 0.3, 'reg_lambda': 0},
+  mean: -1.86995, std: 0.12651, params: {'reg_alpha': 0.3, 'reg_lambda': 0.001},
+  mean: -1.86380, std: 0.12793, params: {'reg_alpha': 0.3, 'reg_lambda': 0.01},
+  mean: -1.87577, std: 0.13002, params: {'reg_alpha': 0.3, 'reg_lambda': 0.03},
+  mean: -1.87402, std: 0.13496, params: {'reg_alpha': 0.3, 'reg_lambda': 0.08},
+  mean: -1.87032, std: 0.12504, params: {'reg_alpha': 0.3, 'reg_lambda': 0.3},
+  mean: -1.88329, std: 0.13237, params: {'reg_alpha': 0.3, 'reg_lambda': 0.5},
+  mean: -1.87196, std: 0.13099, params: {'reg_alpha': 0.5, 'reg_lambda': 0},
+  mean: -1.87196, std: 0.13099, params: {'reg_alpha': 0.5, 'reg_lambda': 0.001},
+  mean: -1.88222, std: 0.14735, params: {'reg_alpha': 0.5, 'reg_lambda': 0.01},
+  mean: -1.86618, std: 0.14006, params: {'reg_alpha': 0.5, 'reg_lambda': 0.03},
+  mean: -1.88579, std: 0.12398, params: {'reg_alpha': 0.5, 'reg_lambda': 0.08},
+  mean: -1.88297, std: 0.12307, params: {'reg_alpha': 0.5, 'reg_lambda': 0.3},
+  mean: -1.88148, std: 0.12622, params: {'reg_alpha': 0.5, 'reg_lambda': 0.5}],
+ {'reg_alpha': 0, 'reg_lambda': 0},
+ -1.8541224387666373)
+```
+哈哈，看来我多此一举了。
+**step6: 降低learning_rate**
+之前使用较高的学习速率是因为可以让收敛更快，但是准确度肯定没有细水长流来的好。最后，我们使用较低的学习速率，以及使用更多的决策树n_estimators来训练数据，看能不能进一步优化分数。
+我们可以用回lightGBM的cv函数了 ，我们代入之前优化好的参数。
+```
+params = {
+    'boosting_type': 'gbdt',
+    'objective': 'regression',
+    'learning_rate': 0.005,
+    'num_leaves': 80,
+    'max_depth': 7,
+    'min_data_in_leaf': 20,
+    'subsample': 1,
+    'colsample_bytree': 0.7,
+    }
+data_train = lgb.Dataset(df_train, y_train, silent=True)
+cv_results = lgb.cv(
+    params, data_train, num_boost_round=10000, nfold=5, stratified=False, shuffle=True, metrics='rmse',
+    early_stopping_rounds=50, verbose_eval=100, show_stdv=True)
+print('best n_estimators:', len(cv_results['rmse-mean']))
+print('best cv score:', cv_results['rmse-mean'][-1])
+[100]   cv_agg's rmse: 1.52939 + 0.0261756
+[200]   cv_agg's rmse: 1.43535 + 0.0187243
+[300]   cv_agg's rmse: 1.39584 + 0.0157521
+[400]   cv_agg's rmse: 1.37935 + 0.0157429
+[500]   cv_agg's rmse: 1.37313 + 0.0164503
+[600]   cv_agg's rmse: 1.37081 + 0.0172752
+[700]   cv_agg's rmse: 1.36942 + 0.0177888
+[800]   cv_agg's rmse: 1.36854 + 0.0180575
+[900]   cv_agg's rmse: 1.36817 + 0.0188776
+[1000]  cv_agg's rmse: 1.36796 + 0.0190279
+[1100]  cv_agg's rmse: 1.36783 + 0.0195969
+best n_estimators: 1079
+best cv score: 1.36772351783
+```
+这就是一个大概过程吧，其实也有更高级的方法，但是这种基本的对于GBM模型的调参方法也是需要了解的吧。如有问题，请多指教。

app.py ADDED Viewed

	@@ -0,0 +1,135 @@

+from definitions import *  # star import; `st` used below is presumably Streamlit re-exported by definitions.py - TODO confirm
+st.set_option('deprecation.showPyplotGlobalUse', False)  # sets the same option that .streamlit/config.toml sets
+st.sidebar.subheader("请选择模型参数:sunglasses:")  # sidebar heading; each slider below stores its selected value in a module-level variable
+num_leaves = st.sidebar.slider(label = 'num_leaves', min_value = 4.0,  # NOTE(review): float slider (step 0.1), but LightGBM's num_leaves is an integer - confirm how it is converted downstream
+                          max_value = 16.0 ,
+                          value = 10.0,
+                          step = 0.1)
+max_depth = st.sidebar.slider(label = 'max_depth',  min_value = 8,  # integer slider, 8..15, default 10
+                          max_value = 15,
+                          value = 10,
+                          step = 1)
+min_data_in_leaf = st.sidebar.slider(label = 'min_data_in_leaf',  min_value = 8,  # integer slider, 8..15, default 10
+                          max_value = 15,
+                          value = 10,
+                          step = 1)
+feature_fraction = st.sidebar.slider(label = 'feature_fraction', min_value = 0.0,  # float slider, 0.0..1.0 in steps of 0.1, default 0.3
+                          max_value = 1.0 ,
+                          value = 0.3,
+                          step = 0.1)
+lambda_l1 = st.sidebar.slider(label = 'lambda_l1', min_value = 0.000,  # float slider, 0..1 in steps of 0.001, default 0.5
+                          max_value = 1.000 ,
+                          value = 0.500,
+                          step = 0.001)
+lambda_l2 = st.sidebar.slider(label = 'lambda_l2', min_value = 1,  # integer slider, 1..72, default 36
+                          max_value = 72,
+                          value = 36,
+                          step = 1)
+min_split_gain = st.sidebar.slider(label = 'min_split_gain', min_value = 6,  # integer slider, 6..289, default 144
+                          max_value = 289 ,
+                          value = 144,
+                          step = 1)
+top_rate = st.sidebar.slider(label = 'top_rate', min_value = 0.0,  # float slider, 0.0..1.0 in steps of 0.1, default 0.3
+                          max_value = 1.0 ,
+                          value = 0.3,
+                          step = 0.1)
+other_rate = st.sidebar.slider(label = 'other_rate', min_value = 0.0,  # float slider, 0.0..1.0 in steps of 0.1, default 0.3
+                          max_value = 1.0 ,
+                          value = 0.3,
+                          step = 0.1)
+min_data_per_group = st.sidebar.slider(label = 'min_data_per_group', min_value = 6,  # integer slider, 6..289, default 32
+                          max_value = 289 ,
+                          value = 32,
+                          step = 1)
+max_cat_threshold = st.sidebar.slider(label = 'max_cat_threshold', min_value = 6,  # integer slider, 6..289, default 32
+                          max_value = 289 ,
+                          value = 32,
+                          step = 1)
+learning_rate = st.sidebar.slider(label = 'learning_rate', min_value = 8.0,
+                          max_value = 15.0,
+                          value = 10.5,
+                          step = 0.1)
+num_leaves = st.sidebar.slider(label = 'num_leaves',  min_value = 6,
+                          max_value = 289 ,
+                          value = 31,
+                          step = 1)
+min_gain_to_split  = st.sidebar.slider(label = 'min_gain_to_split', min_value = 0.0,
+                          max_value = 15.0,
+                          value = 0.0,
+                          step = 0.1)
+max_bin = st.sidebar.slider(label = 'max_bin', min_value = 6,
+                          max_value = 289 ,
+                          value = 255,
+                          step = 1)
+num_iterations = st.sidebar.slider(label = 'num_iterations', min_value = 8,
+                          max_value = 15,
+                          value = 10,
+                          step = 1)
+st.title('LightGBM-parameter-tuning-with-streamlit')
+# 加载数据
+breast_cancer = load_breast_cancer()
+data = breast_cancer.data
+target = breast_cancer.target
+# 划分训练数据和测试数据
+X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
+# 转换为Dataset数据格式
+lgb_train = lgb.Dataset(X_train, y_train)
+lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
+# 模型训练
+params = {'num_leaves': num_leaves, 'max_depth': max_depth,
+            'min_data_in_leaf': min_data_in_leaf,
+            'feature_fraction': feature_fraction,
+            'lambda_l1': lambda_l1, 'lambda_l2': lambda_l2,
+            'min_split_gain': min_split_gain, 'top_rate': top_rate,
+            'other_rate': other_rate, 'min_data_per_group': min_data_per_group,
+            'max_cat_threshold': max_cat_threshold,
+            'learning_rate':learning_rate,'num_leaves':num_leaves,'min_gain_to_split':min_gain_to_split,
+            'max_bin':max_bin,'num_iterations':num_iterations
+            }
+gbm = lgb.train(params, lgb_train, num_boost_round=2000, valid_sets=lgb_eval, early_stopping_rounds=500)
+lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
+probs = gbm.predict(X_test, num_iteration=gbm.best_iteration)  # 输出的是概率结果
+fpr, tpr, thresholds = roc_curve(y_test, probs)
+st.write('------------------------------------')
+st.write('Confusion Matrix:')
+st.write(confusion_matrix(y_test, np.where(probs > 0.5, 1, 0)))
+st.write('------------------------------------')
+st.write('Classification Report:')
+report = classification_report(y_test, np.where(probs > 0.5, 1, 0), output_dict=True)
+report_matrix = pd.DataFrame(report).transpose()
+st.dataframe(report_matrix)
+st.write('------------------------------------')
+st.write('ROC:')
+plot_roc(fpr, tpr)

definitions.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import pandas as pd
+import streamlit as st
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import load_breast_cancer
+from sklearn.metrics import roc_auc_score,roc_curve,auc,accuracy_score,classification_report,confusion_matrix,precision_recall_curve
+import lightgbm as lgb
+import matplotlib.pyplot as plt
+import warnings
+warnings.filterwarnings('ignore')
+def plot_roc(fpr, tpr, label=None):
+    roc_auc = auc(fpr, tpr)
+    plt.title('Receiver Operating Characteristic')
+    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
+    plt.legend(loc = 'lower right')
+    plt.plot([0, 1], [0, 1],'r--')
+    plt.xlim([0, 1])
+    plt.ylim([0, 1])
+    plt.ylabel('True Positive Rate')
+    plt.xlabel('False Positive Rate')
+    plt.show()
+    st.pyplot()

git.sh ADDED Viewed

	@@ -0,0 +1,3 @@

+git add .
+git commit -m "1.0"
+git push

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+# Pinned with '==': pip cannot parse the whitespace-separated "name version"
+# columns printed by `pip list`.
+pandas==1.3.1
+streamlit==1.8.1
+numpy==1.20.3
+# 'sklearn' on PyPI is only a placeholder; the real distribution is scikit-learn.
+scikit-learn
+lightgbm==3.3.2
+matplotlib==3.4.2