diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..11d3632b6fa2f462ed6f994d677e831ea97f3f1e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,8 @@ +*.bin filter=lfs diff=lfs merge=lfs -text +*.sav filter=lfs diff=lfs merge=lfs -text +*.csv filter=lfs diff=lfs merge=lfs -text +*.savgit filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text *.7z filter=lfs diff=lfs merge=lfs -text *.arrow filter=lfs diff=lfs merge=lfs -text *.bin filter=lfs diff=lfs merge=lfs -text diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f8c0b4f276e59a90c21261021bec81240701f5ff --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 kedir + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/app/client/__pycache__/examples.cpython-38.pyc b/app/client/__pycache__/examples.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99764bd78c7cd13b48bc58db053d2bfd10203564 Binary files /dev/null and b/app/client/__pycache__/examples.cpython-38.pyc differ diff --git a/app/client/__pycache__/ner_examples.cpython-38.pyc b/app/client/__pycache__/ner_examples.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a213fca47f4ce39b6371d6e233b1daaa0e28218e Binary files /dev/null and b/app/client/__pycache__/ner_examples.cpython-38.pyc differ diff --git a/app/client/requirement.txt b/app/client/requirement.txt new file mode 100644 index 0000000000000000000000000000000000000000..4952ff22651299df77921e764ef96d926e541d71 --- /dev/null +++ b/app/client/requirement.txt @@ -0,0 +1,7 @@ +streamlit +pandas +numpy +PyPDF2 +st-annotated-text +wordcloud +Pillow \ No newline at end of file diff --git a/data/GLG_test_data.csv b/data/GLG_test_data.csv new file mode 100644 index 0000000000000000000000000000000000000000..468a55c1ee7e20878a5af88fbf39195b35ce5e95 --- /dev/null +++ b/data/GLG_test_data.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52c4df02c02fc73d5691675e9f0b7ab4131a094e8bedbab520be05eeb58e2b2b +size 3286384 diff --git a/data/GLG_test_data_emb.csv b/data/GLG_test_data_emb.csv new file mode 100644 index 0000000000000000000000000000000000000000..6639f46f301ac44108fdf0d5f14f38f9bfd03ba4 --- /dev/null +++ b/data/GLG_test_data_emb.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eb13d16f0bef7bf79c642cb70d2ec94a1de5276f6e15439b379247ec11f5819 +size 54050 diff --git a/data/GLG_train_data.csv b/data/GLG_train_data.csv new file mode 100644 index 0000000000000000000000000000000000000000..ad2cb62f7e052e07f31981c727de07588c8855b6 --- /dev/null +++ b/data/GLG_train_data.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a3ec3f54c34ace7f910a0027f39b78734c0a4ee0b47aa2f5070be035800ce84 +size 29353688 diff --git a/data/GLG_train_data_emb.csv b/data/GLG_train_data_emb.csv new file mode 100644 index 0000000000000000000000000000000000000000..1981f1a7535d9c9d923966f3072a13387c10e7ac --- /dev/null +++ b/data/GLG_train_data_emb.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4669badf749f670e60c199b1382be98d7e58e300d9017113cdb0973d3088b26 +size 485927 diff --git a/data/GLG_train_data_labeled.csv b/data/GLG_train_data_labeled.csv new file mode 100644 index 0000000000000000000000000000000000000000..09736a856838a1733155413c70b0861711330564 --- /dev/null +++ b/data/GLG_train_data_labeled.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3aa1336597969beb5c45e0f3904f53c6245f0d2963374ebabb2392af03ccfb +size 29379406 diff --git a/data/data_tech_health.csv b/data/data_tech_health.csv new file mode 100644 index 0000000000000000000000000000000000000000..8de9bf5ad300e3064715c6ff7438c7b646e5e174 --- /dev/null +++ b/data/data_tech_health.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4a42ac33656056d5e1aede3216abdca4bfed591e13c2daf285c4dc2cefba4e6 +size 32639981 diff --git a/data/hierarchial_cluster.csv b/data/hierarchial_cluster.csv new file mode 100644 index 0000000000000000000000000000000000000000..0f2c9b66db0822a8ae43b28650475e75e379844f --- /dev/null +++ b/data/hierarchial_cluster.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06d1a66d7f4373e2b3c6a823202dfd608acbeba87e28f439d73f73f1e7757f6 +size 326670 diff --git a/data/ner.csv b/data/ner.csv new file mode 100644 index 0000000000000000000000000000000000000000..5976f99351f5647331edca9f30c9ac7fc27f84de --- /dev/null +++ b/data/ner.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0016023ba71542431bfec2e90c2e1cad7d9071eadcdd8782c8dcf45ea98df1 +size 157030359 diff --git a/doc/frontend_main.png b/doc/frontend_main.png new file mode 100644 index 0000000000000000000000000000000000000000..7741d9f01c012d3946bc36c3ea87a022d6110fa1 Binary files /dev/null and b/doc/frontend_main.png differ diff --git a/doc/frontend_ner.png b/doc/frontend_ner.png new file mode 100644 index 0000000000000000000000000000000000000000..974e229cda53f00bda94c2e4ba424939fac559f5 Binary files /dev/null and b/doc/frontend_ner.png differ diff --git a/doc/frontend_topic.png b/doc/frontend_topic.png new file mode 100644 index 0000000000000000000000000000000000000000..d74158420c69d86160bb14d1f2ba01c63efc7797 Binary files /dev/null and b/doc/frontend_topic.png differ diff --git a/doc/image.md b/doc/image.md new file mode 100644 index 0000000000000000000000000000000000000000..0daf65df227c00a8ee536c1c42941a49184f1b2c --- /dev/null +++ b/doc/image.md @@ -0,0 +1 @@ +List of images used in README diff --git a/doc/ner_example.png b/doc/ner_example.png new file mode 100644 index 0000000000000000000000000000000000000000..612d6d43b3856da67f641aac9c6ac65611ef78c3 Binary files /dev/null and b/doc/ner_example.png differ diff --git a/doc/topic_modeling_pipeline.png b/doc/topic_modeling_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..ce9a6e5aa9881a05c1d8daa7c187dd7590b644ab Binary files /dev/null and b/doc/topic_modeling_pipeline.png differ diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..d3278da536bf065cab3aabce1dc1e3ddfeeb3f4c --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,26 @@ +version: "3.7" +services: + backendtopic: + build: src_topic + ports: + - "8000:8000" + networks: + - AIservice + + backendner: + build: src_ner + ports: + - "9000:9000" + networks: + - AIservice + + frontend: + build: ui-frontend + ports: + - "8501:8501" + networks: + - AIservice + +networks: + AIservice: + external: true \ No newline at end of file diff --git a/models/nermodels/model.pt b/models/nermodels/model.pt new file mode 100644 index 0000000000000000000000000000000000000000..15d4f9f7b664ea4c1ccb6d24e4ef1b43134c3870 --- /dev/null +++ b/models/nermodels/model.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db187faad0c0dd6e7d35663fe1bf4071b08e5fcbdf298126641dd5e3d6f952b +size 433397295 diff --git a/models/topicmodels/mdl_topic_model_global_-1.bin b/models/topicmodels/mdl_topic_model_global_-1.bin new file mode 100644 index 0000000000000000000000000000000000000000..8868298e080b0d1e61e7c5f60f63b5fbe4664802 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_-1.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5993ccbc76556a0cbb8c08394b886ff0a5ae72091f38cf8b5099441adf24153d +size 204639 diff --git a/models/topicmodels/mdl_topic_model_global_0.bin b/models/topicmodels/mdl_topic_model_global_0.bin new file mode 100644 index 0000000000000000000000000000000000000000..e0041361d780a0897324ca1f21fdf47de3d2f236 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_0.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:627e65da67fe062c41df17e2784ab85cead1a4b466b92c6037ef02bc351c6546 +size 204639 diff --git a/models/topicmodels/mdl_topic_model_global_1.bin b/models/topicmodels/mdl_topic_model_global_1.bin new file mode 100644 index 0000000000000000000000000000000000000000..3ec27bcd99c6c1f0043ac7cd6dc997a039c55321 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_1.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb3181de2990396a37b4bf34587d8e626979af6ed9a0d6bf8cf364111a763bc +size 1016311 diff --git a/models/topicmodels/mdl_topic_model_global_10.bin b/models/topicmodels/mdl_topic_model_global_10.bin new file mode 100644 index 0000000000000000000000000000000000000000..3e997970526041748ac0ac4efe3c08ba831b6978 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_10.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ea7f4eca39344e28632efb40123bc53ece2d8620f89fefae09af294c1a4df16 +size 62054 diff --git a/models/topicmodels/mdl_topic_model_global_11.bin b/models/topicmodels/mdl_topic_model_global_11.bin new file mode 100644 index 0000000000000000000000000000000000000000..0a6347d0cf45def0f6bd15cc50db4c9982d18b4b --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_11.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b21aabafbca40868adf43bc5c5ee6ab0a8fc9f4afbeeebcd6bf9277a609988f +size 62054 diff --git a/models/topicmodels/mdl_topic_model_global_12.bin b/models/topicmodels/mdl_topic_model_global_12.bin new file mode 100644 index 0000000000000000000000000000000000000000..ec3a7fb853e9d93fd741e1cb049ce95261855e25 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_12.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12bd5e96995e4f26679480d1dce7bc58c4858f85f7907c1efc2417b8fd7710b2 +size 330222 diff --git a/models/topicmodels/mdl_topic_model_global_13.bin b/models/topicmodels/mdl_topic_model_global_13.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3039a34c685102fed24870c2d4194c0051da298 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_13.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4ed010c6e5f1cfcccdbcff1c466a3208c62f8a165583939019e93014d046506 +size 330224 diff --git a/models/topicmodels/mdl_topic_model_global_14.bin b/models/topicmodels/mdl_topic_model_global_14.bin new file mode 100644 index 0000000000000000000000000000000000000000..5756a9251acb9ba62b4d1052573e14a0792412db --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_14.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81879ac8035b921bae4968c02ee5b754613e5601111a72b65f8a83d32ae19d54 +size 71763 diff --git a/models/topicmodels/mdl_topic_model_global_15.bin b/models/topicmodels/mdl_topic_model_global_15.bin new file mode 100644 index 0000000000000000000000000000000000000000..7f2f871a648e288b614c3c6161a68b44c386b5b1 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_15.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ad10d496c029727dcbe5652692718197bcadec82ae1cb8e79b98e11b79a627 +size 9001 diff --git a/models/topicmodels/mdl_topic_model_global_16.bin b/models/topicmodels/mdl_topic_model_global_16.bin new file mode 100644 index 0000000000000000000000000000000000000000..abcb770d51f51ab5644c0ab06de3af27995c4cdd --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_16.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3acc86676ff2d30b71a70f885734de53647f0478df0d199c63034bc0dd40c46d +size 22810 diff --git a/models/topicmodels/mdl_topic_model_global_17.bin b/models/topicmodels/mdl_topic_model_global_17.bin new file mode 100644 index 0000000000000000000000000000000000000000..932bf0aeac591dcc24378462c7b24351f58cf4b7 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_17.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9eb68e96efc32d42898e58c864a69b9ba7638bbd776a4cb1186769dc07ad63b +size 1040089 diff --git a/models/topicmodels/mdl_topic_model_global_18.bin b/models/topicmodels/mdl_topic_model_global_18.bin new file mode 100644 index 0000000000000000000000000000000000000000..84c5fe659f9b4156cf614164efabbcf3e4372e66 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_18.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5648d04e9e6705256b2ec581afb5c16541cf4824e3237819502a6f06f1e5998b +size 1032674 diff --git a/models/topicmodels/mdl_topic_model_global_19.bin b/models/topicmodels/mdl_topic_model_global_19.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8a4355ee11c70b400dbf876f6fb15421e86f8c9 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_19.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1c123ced85f1dc50dfb8fb65f2cd860cef2894a9f9614e55a0421abf98a8113 +size 87692 diff --git a/models/topicmodels/mdl_topic_model_global_2.bin b/models/topicmodels/mdl_topic_model_global_2.bin new file mode 100644 index 0000000000000000000000000000000000000000..3c7da414e88d62230f5b6bbfd6ac694b5009ed6b --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_2.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6afdac1aee771038cdcb581c542d9b9389daa5b2b58105e4df39b7a2a768cf65 +size 286107 diff --git a/models/topicmodels/mdl_topic_model_global_20.bin b/models/topicmodels/mdl_topic_model_global_20.bin new file mode 100644 index 0000000000000000000000000000000000000000..5a5f706fca37335713da2e78f914da5658460637 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_20.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6832d9cb0b838680f5ac82da144fdd8a887f244dac30e94b000951ff7c6ab17e +size 691600 diff --git a/models/topicmodels/mdl_topic_model_global_21.bin b/models/topicmodels/mdl_topic_model_global_21.bin new file mode 100644 index 0000000000000000000000000000000000000000..f82cc52967ef175cb0cb08887b18bc96706810c6 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_21.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c03cb254275658e3d1e399a40bfd290dc280ca6d324b644303ede6a1268e6f +size 386978 diff --git a/models/topicmodels/mdl_topic_model_global_22.bin b/models/topicmodels/mdl_topic_model_global_22.bin new file mode 100644 index 0000000000000000000000000000000000000000..dde23a9018b8182462cd06da5ff2372b12fbccf2 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_22.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d91804cbff80beeb27936477531de15c8d303b2b50e9b77b66d876466fa9100 +size 2242937 diff --git a/models/topicmodels/mdl_topic_model_global_3.bin b/models/topicmodels/mdl_topic_model_global_3.bin new file mode 100644 index 0000000000000000000000000000000000000000..f7892a9ff9ec33cfeeb59731f2b0eaa417b44bc4 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_3.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:533177cbd23bd33a271890c09bb114737830ff603bd864d4304933546c8c8f28 +size 6693 diff --git a/models/topicmodels/mdl_topic_model_global_4.bin b/models/topicmodels/mdl_topic_model_global_4.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fe22da18c518bddd201784907b0a44791d1d6f1 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_4.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1a20d70ae0d4d9ce3d1473aa53930d6f3f5c2a8f4dd2f75cbb73f6bd3a620d5 +size 885 diff --git a/models/topicmodels/mdl_topic_model_global_5.bin b/models/topicmodels/mdl_topic_model_global_5.bin new file mode 100644 index 0000000000000000000000000000000000000000..18090f904b6654d18ef57b8c3cf180575fb0d5a7 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_5.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2ca53907d41bb936b7ad7ff1d304ddd2f628e283fb2954dfe93cbf1d595a1f9 +size 232539 diff --git a/models/topicmodels/mdl_topic_model_global_6.bin b/models/topicmodels/mdl_topic_model_global_6.bin new file mode 100644 index 0000000000000000000000000000000000000000..4eb0bb151e5e912d9989d89326f1c816574a7f71 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_6.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e50ece23ca797d682ecca9a84b08f0cab23bed505df65662fb508bdb6a8f539 +size 18182 diff --git a/models/topicmodels/mdl_topic_model_global_7.bin b/models/topicmodels/mdl_topic_model_global_7.bin new file mode 100644 index 0000000000000000000000000000000000000000..2b52cd900d28ade7d60fc75c621aac6fc1471191 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_7.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52ef1e6e5e7cd7a046f05eef9ea9e15a8f2719079929558fb3141f8e7d9c7d3d +size 18182 diff --git a/models/topicmodels/mdl_topic_model_global_8.bin b/models/topicmodels/mdl_topic_model_global_8.bin new file mode 100644 index 0000000000000000000000000000000000000000..7977db03f28bb0cf010d587634c9866fe9970a7a --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_8.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9240784cc4f15d21812f25868db18d8df436c21eea8d6a4e4ded19a3418226f4 +size 206473 diff --git a/models/topicmodels/mdl_topic_model_global_9.bin b/models/topicmodels/mdl_topic_model_global_9.bin new file mode 100644 index 0000000000000000000000000000000000000000..cbcbc9a980b5e42420dc40d8df98d9602413a2b5 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_global_9.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c8f1bf1b5dbf19ba829eb4fd5441192212de56733aed482ddd45a6d8e22a75 +size 314923 diff --git a/models/topicmodels/mdl_topic_model_local_-1.bin b/models/topicmodels/mdl_topic_model_local_-1.bin new file mode 100644 index 0000000000000000000000000000000000000000..c9922287c7fed95bfd13bcfbad7044c8826cef54 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_-1.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bd30ddba6458c56ec62729d897450b0bcea94b994a63b8204997dedf0383984 +size 4625635 diff --git a/models/topicmodels/mdl_topic_model_local_0.bin b/models/topicmodels/mdl_topic_model_local_0.bin new file mode 100644 index 0000000000000000000000000000000000000000..701ff69b838a51279b1f14f8e2eaa22cae1cfa24 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_0.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b14b12624eabf92060f07632c1ffdccbed7025fbd9d0c3479e05dc4dc9f069 +size 541159 diff --git a/models/topicmodels/mdl_topic_model_local_1.bin b/models/topicmodels/mdl_topic_model_local_1.bin new file mode 100644 index 0000000000000000000000000000000000000000..353b1c183aa653232cc1ad9e8a9e389a0bbec0aa --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_1.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc500c9c5364a359898f2e54ff1d763bbc9b9c8be59c99af10b6b25623105da8 +size 2996229 diff --git a/models/topicmodels/mdl_topic_model_local_10.bin b/models/topicmodels/mdl_topic_model_local_10.bin new file mode 100644 index 0000000000000000000000000000000000000000..6134604388781e0a2828d8bd069cf79d667d34d5 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_10.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b5a3592dc874e76e31e41478d4effbc563190dbefe8d94c24c832f66dd8e093 +size 655701 diff --git a/models/topicmodels/mdl_topic_model_local_11.bin b/models/topicmodels/mdl_topic_model_local_11.bin new file mode 100644 index 0000000000000000000000000000000000000000..5ec69f62634130599692a02d3016d4317ae08544 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_11.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eca7b2d803645e60aa2e52f3f75d2741b0de5d57cbea3429c289d668ea246d4 +size 1239019 diff --git a/models/topicmodels/mdl_topic_model_local_12.bin b/models/topicmodels/mdl_topic_model_local_12.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bfa4aca0dd09ff42940c7344b36f85f79cc4884 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_12.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67abd54c5bafa8addf71cd382cef671cd40756a5a33682b147807b8f93f292e3 +size 781389 diff --git a/models/topicmodels/mdl_topic_model_local_13.bin b/models/topicmodels/mdl_topic_model_local_13.bin new file mode 100644 index 0000000000000000000000000000000000000000..f96438f3dafc41467dc18c092a9c3706b87cc723 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_13.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aa233867e151202d798236bb19c8c72005afd887c755900963a05bea3eaa7ab +size 700141 diff --git a/models/topicmodels/mdl_topic_model_local_14.bin b/models/topicmodels/mdl_topic_model_local_14.bin new file mode 100644 index 0000000000000000000000000000000000000000..0a3fa1333b10293c5d85ba180fe78fd35e994976 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_14.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64f1c50dc06f5b24859075b13b80c9e7749d03be60770e166939fa85f5c80a1f +size 899770 diff --git a/models/topicmodels/mdl_topic_model_local_15.bin b/models/topicmodels/mdl_topic_model_local_15.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c6748b3013969e9ee560d9e74db503248cae85 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_15.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5451967edb32b31b78a968579d16d84da0c9db7d18f6d6c6d9e7a93027173a6e +size 636186 diff --git a/models/topicmodels/mdl_topic_model_local_16.bin b/models/topicmodels/mdl_topic_model_local_16.bin new file mode 100644 index 0000000000000000000000000000000000000000..052910be41127dbf00cf5142864964aa22497610 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_16.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c7da3d20c28a6c583c2de9b9def5ed6ed93756f1c661a4d3cc759e3a03bb804 +size 903305 diff --git a/models/topicmodels/mdl_topic_model_local_17.bin b/models/topicmodels/mdl_topic_model_local_17.bin new file mode 100644 index 0000000000000000000000000000000000000000..b603ca0a932ed667e4cc17164dd5ce17d0fad086 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_17.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0e3f107ab7d216ea50c090a02312f13ba0103ecaa41b3b2c1cad1de6b31238c +size 1819446 diff --git a/models/topicmodels/mdl_topic_model_local_18.bin b/models/topicmodels/mdl_topic_model_local_18.bin new file mode 100644 index 0000000000000000000000000000000000000000..dc5e1a7c4905ce00271586af205e289eedcebb54 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_18.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efb9db09b0580c162dbf3883759ab8df338a184ba150ff174f4f38f986c2c9e8 +size 1116275 diff --git a/models/topicmodels/mdl_topic_model_local_19.bin b/models/topicmodels/mdl_topic_model_local_19.bin new file mode 100644 index 0000000000000000000000000000000000000000..dfb06a0b4393914545f0964743265de2fc043fe3 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_19.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b4c14bef781e71c7ce15e49e416b0daf0bdb8d4058551bd1ccb25ad42d14c5 +size 546279 diff --git a/models/topicmodels/mdl_topic_model_local_2.bin b/models/topicmodels/mdl_topic_model_local_2.bin new file mode 100644 index 0000000000000000000000000000000000000000..1009fe3d8c8ce54dac5e805399dd34ffd110a051 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_2.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:724fde299daa0367201be4a57f282822b6a5bb415bde11eff9d156974567bbf4 +size 1176966 diff --git a/models/topicmodels/mdl_topic_model_local_20.bin b/models/topicmodels/mdl_topic_model_local_20.bin new file mode 100644 index 0000000000000000000000000000000000000000..922121081c8b188f33fc33e4acc06e4693db827e --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_20.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cc7b322fa0d32ebfa9d8f6335d20bbfee92decf20a5ded41c1179cc7e5a849e +size 1364426 diff --git a/models/topicmodels/mdl_topic_model_local_21.bin b/models/topicmodels/mdl_topic_model_local_21.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f4d63ed727f0f95fbc7d8e0c417afc50a3fa1a6 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_21.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4753832a692347c22ab55c7bc72782ce5de1d73afe404674c27618bb60c81f06 +size 1082315 diff --git a/models/topicmodels/mdl_topic_model_local_22.bin b/models/topicmodels/mdl_topic_model_local_22.bin new file mode 100644 index 0000000000000000000000000000000000000000..8aa63597405e4f4baf62e4306c60f23d42d5f57f --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_22.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b48a5c0cc376731a794a0de896e00096823dd9ca90d2bc4f23e62c6ac576a70a +size 3239369 diff --git a/models/topicmodels/mdl_topic_model_local_3.bin b/models/topicmodels/mdl_topic_model_local_3.bin new file mode 100644 index 0000000000000000000000000000000000000000..125303ebfbbbb55a1f732a07edca19a636797104 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_3.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f5611e2e7d2ec50451b2dd3a4f1f1b361d9ef5b4a9490f804936b3040dc7110 +size 1824748 diff --git a/models/topicmodels/mdl_topic_model_local_4.bin b/models/topicmodels/mdl_topic_model_local_4.bin new file mode 100644 index 0000000000000000000000000000000000000000..3ab70d95a450997357a46f27f2b141418d52adad --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_4.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3acf4b0955a52ed28757cda157f44e3b34e9eb48f5818d9dd13d277ed806a74f +size 582164 diff --git a/models/topicmodels/mdl_topic_model_local_5.bin b/models/topicmodels/mdl_topic_model_local_5.bin new file mode 100644 index 0000000000000000000000000000000000000000..a6cef32e2378597a7633981341900d265e3ff5ea --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_5.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d14b41b67090a491b400bc851bb80f6f066db70447c59e2e2c791e384c115728 +size 510608 diff --git a/models/topicmodels/mdl_topic_model_local_6.bin b/models/topicmodels/mdl_topic_model_local_6.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1884d6c3ccd92710f8abc00e36573bcfa0987c3 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_6.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f82ff2867d3763b00f2cdf825dd9fabf0116bb6d83a79804aaedcd0c70cc6f76 +size 886083 diff --git a/models/topicmodels/mdl_topic_model_local_7.bin b/models/topicmodels/mdl_topic_model_local_7.bin new file mode 100644 index 0000000000000000000000000000000000000000..3cbd89bdfc178c86e903b386d2ea184da3ce699d --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_7.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd10d0c9f8a8bd5473f0ce5bacc09162260cc37f168ee01bad4115438d75637f +size 761963 diff --git a/models/topicmodels/mdl_topic_model_local_8.bin b/models/topicmodels/mdl_topic_model_local_8.bin new file mode 100644 index 0000000000000000000000000000000000000000..729f1785cfcdc95453b77c305d9ec838322669df --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_8.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c457e63e788a90274b8a06f9e14e1d67aa8c5685797b00ac3282b00077e8fa +size 815216 diff --git a/models/topicmodels/mdl_topic_model_local_9.bin b/models/topicmodels/mdl_topic_model_local_9.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b2fb0307c0a070076a0d789aafc113c2a10fb58 --- /dev/null +++ b/models/topicmodels/mdl_topic_model_local_9.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:484c5d8898115d7620d135fdfdf3783d25c329474db8884f542fbfd98e511b6a +size 640296 diff --git a/notebooks/GLG-1-News-Article-Data-Exploration.ipynb b/notebooks/GLG-1-News-Article-Data-Exploration.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..4750264d8753e3cf801307498958b0e8423219dd --- /dev/null +++ b/notebooks/GLG-1-News-Article-Data-Exploration.ipynb @@ -0,0 +1,2427 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "qIFLx0_wimTB" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "pd.set_option('max_colwidth',150)\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from datetime import datetime as dt\n", + "from string import punctuation\n", + "import re\n", + "import os\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = \"all\" # allow multiple outputs in a cell\n", + "import warnings\n", + "import pandas as pd\n", + "pd.options.plotting.backend = \"plotly\"\n", + "warnings.filterwarnings(\"ignore\")\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Download and Extract the Datasets" + ], + "metadata": { + "id": "QqvaLRjVjIj3" + } + }, + { + "cell_type": "code", + "source": [ + "# Downloading all-the-news-2-news-articles-dataset \n", + "! wget https://www.dropbox.com/s/cn2utnr5ipathhh/all-the-news-2-1.zip?dl=0\n", + "\n", + "# Downloading Annotated Corpus for Named Entity Recognition dataset\n", + "!gdown https://drive.google.com/uc?id=13y8JNgL5TQ4x-yufpBOv3QBsEiE051sE\n", + "\n", + "# Make a data folder to store the data\n", + "!mkdir data\n", + "\n", + "!unzip /content/all-the-news-2-1.zip?dl=0 -d ./data/\n", + "\n", + "!mv /content/ner.csv ./data\n", + "\n", + "!rm /content/all-the-news-2-1.zip?dl=0\n", + "\n" + ], + "metadata": { + "id": "VYvJeKsujCFY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Load Data" + ], + "metadata": { + "id": "liJiX3Xf2hQh" + } + }, + { + "cell_type": "code", + "source": [ + "#specify the path to data location\n", + "\n", + "filepath = '/content/data/all-the-news-2-1.csv'\n", + "# data = pd.read_csv(filepath, encoding = \"ISO-8859-1\")\n", + "data = pd.read_csv(filepath, encoding = \"utf-8\") \n" + ], + "metadata": { + "id": "LMwtt2rJnNhB" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#Verify that the data is loaded correctly\n", + "data.head(3)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "g4VoxOSnnOs9", + "outputId": "4f0dea96-29e8-4f80-f009-12e9ef6e0c05" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " date year month day author \\\n", + "0 2016-12-09 18:31:00 2016 12.0 9 Lee Drutman \n", + "1 2016-10-07 21:26:46 2016 10.0 7 Scott Davis \n", + "2 2018-01-26 00:00:00 2018 1.0 26 NaN \n", + "\n", + " title \\\n", + "0 We should take concerns about the health of liberal democracy seriously \n", + "1 Colts GM Ryan Grigson says Andrew Luck's contract makes it difficult to build the team \n", + "2 Trump denies report he ordered Mueller fired \n", + "\n", + " article \\\n", + "0 This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to de... \n", + "1 The Indianapolis Colts made Andrew Luck the highest-paid player in NFL history this offseason with a five-year, $122-million contract with $89 mi... \n", + "2 DAVOS, Switzerland (Reuters) - U.S. President Donald Trump denied a report on Friday that he had ordered Special Counsel Robert Mueller fired last... \n", + "\n", + " url \\\n", + "0 https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs \n", + "1 https://www.businessinsider.com/colts-gm-ryan-grigson-andrew-luck-contract-2016-10 \n", + "2 https://www.reuters.com/article/us-davos-meeting-trump-mueller/trump-denies-report-he-ordered-mueller-fired-idUSKBN1FF12A \n", + "\n", + " section publication \n", + "0 NaN Vox \n", + "1 NaN Business Insider \n", + "2 Davos Reuters " + ], + "text/html": [ + "\n", + "
\n", + " | date | \n", + "year | \n", + "month | \n", + "day | \n", + "author | \n", + "title | \n", + "article | \n", + "url | \n", + "section | \n", + "publication | \n", + "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "2016-12-09 18:31:00 | \n", + "2016 | \n", + "12.0 | \n", + "9 | \n", + "Lee Drutman | \n", + "We should take concerns about the health of liberal democracy seriously | \n", + "This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to de... | \n", + "https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs | \n", + "NaN | \n", + "Vox | \n", + "
1 | \n", + "2016-10-07 21:26:46 | \n", + "2016 | \n", + "10.0 | \n", + "7 | \n", + "Scott Davis | \n", + "Colts GM Ryan Grigson says Andrew Luck's contract makes it difficult to build the team | \n", + "The Indianapolis Colts made Andrew Luck the highest-paid player in NFL history this offseason with a five-year, $122-million contract with $89 mi... | \n", + "https://www.businessinsider.com/colts-gm-ryan-grigson-andrew-luck-contract-2016-10 | \n", + "NaN | \n", + "Business Insider | \n", + "
2 | \n", + "2018-01-26 00:00:00 | \n", + "2018 | \n", + "1.0 | \n", + "26 | \n", + "NaN | \n", + "Trump denies report he ordered Mueller fired | \n", + "DAVOS, Switzerland (Reuters) - U.S. President Donald Trump denied a report on Friday that he had ordered Special Counsel Robert Mueller fired last... | \n", + "https://www.reuters.com/article/us-davos-meeting-trump-mueller/trump-denies-report-he-ordered-mueller-fired-idUSKBN1FF12A | \n", + "Davos | \n", + "Reuters | \n", + "
\n", + " | date | \n", + "year | \n", + "month | \n", + "day | \n", + "author | \n", + "title | \n", + "article | \n", + "url | \n", + "section | \n", + "publication | \n", + "tech_health_tag | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "2018-05-02 17:09:00 | \n", + "2018 | \n", + "5.0 | \n", + "2 | \n", + "Caroline Williams | \n", + "You Can Trick Your Brain Into Being More Focused | \n", + "If only every day could be like this. You can’t put your finger on why: Maybe you had just the right amount of sleep. Maybe the stars are somehow ... | \n", + "https://www.vice.com/en_us/article/9kgp4v/how-to-improve-focus-be-more-creative | \n", + "Health | \n", + "Vice | \n", + "health | \n", + "
1 | \n", + "2018-10-05 19:35:00 | \n", + "2018 | \n", + "10.0 | \n", + "5 | \n", + "Caroline Haskins | \n", + "Trash Geyser Spews Garbage In Yellowstone National Park | \n", + "Geyser eruptions are known as one of the most beautiful events to occur in nature. Not anymore! On September 15, Yellowstone Park’s Ear Spring ge... | \n", + "https://www.vice.com/en_us/article/evwq47/ear-spring-geyser-spews-trash-in-yellowstone-national-park | \n", + "Tech by VICE | \n", + "Vice | \n", + "technology | \n", + "
2 | \n", + "2019-06-20 00:00:00 | \n", + "2019 | \n", + "6.0 | \n", + "20 | \n", + "Gergely Szakacs | \n", + "Hungary has no evidence of Huawei threat, plans rapid 5G rollout: minister | \n", + "BUDAPEST (Reuters) - Hungary has no evidence that equipment from Chinese telecoms giant Huawei poses a security threat, a government minister said... | \n", + "https://www.reuters.com/article/us-hungary-telecoms-5g-huawei/hungary-has-no-evidence-of-huawei-threat-plans-rapid-5g-rollout-minister-idUSKCN1TL2AP | \n", + "Technology News | \n", + "Reuters | \n", + "technology | \n", + "
\n", + " | 0 | \n", + "1 | \n", + "2 | \n", + "3 | \n", + "4 | \n", + "
---|---|---|---|---|---|
Unnamed: 0 | \n", + "0 | \n", + "1 | \n", + "2 | \n", + "3 | \n", + "4 | \n", + "
lemma | \n", + "thousand | \n", + "of | \n", + "demonstr | \n", + "have | \n", + "march | \n", + "
next-lemma | \n", + "of | \n", + "demonstr | \n", + "have | \n", + "march | \n", + "through | \n", + "
next-next-lemma | \n", + "demonstr | \n", + "have | \n", + "march | \n", + "through | \n", + "london | \n", + "
next-next-pos | \n", + "NNS | \n", + "VBP | \n", + "VBN | \n", + "IN | \n", + "NNP | \n", + "
next-next-shape | \n", + "lowercase | \n", + "lowercase | \n", + "lowercase | \n", + "lowercase | \n", + "capitalized | \n", + "
next-next-word | \n", + "demonstrators | \n", + "have | \n", + "marched | \n", + "through | \n", + "London | \n", + "
next-pos | \n", + "IN | \n", + "NNS | \n", + "VBP | \n", + "VBN | \n", + "IN | \n", + "
next-shape | \n", + "lowercase | \n", + "lowercase | \n", + "lowercase | \n", + "lowercase | \n", + "lowercase | \n", + "
next-word | \n", + "of | \n", + "demonstrators | \n", + "have | \n", + "marched | \n", + "through | \n", + "
pos | \n", + "NNS | \n", + "IN | \n", + "NNS | \n", + "VBP | \n", + "VBN | \n", + "
prev-iob | \n", + "__START1__ | \n", + "O | \n", + "O | \n", + "O | \n", + "O | \n", + "
prev-lemma | \n", + "__start1__ | \n", + "thousand | \n", + "of | \n", + "demonstr | \n", + "have | \n", + "
prev-pos | \n", + "__START1__ | \n", + "NNS | \n", + "IN | \n", + "NNS | \n", + "VBP | \n", + "
prev-prev-iob | \n", + "__START2__ | \n", + "__START1__ | \n", + "O | \n", + "O | \n", + "O | \n", + "
prev-prev-lemma | \n", + "__start2__ | \n", + "__start1__ | \n", + "thousand | \n", + "of | \n", + "demonstr | \n", + "
prev-prev-pos | \n", + "__START2__ | \n", + "__START1__ | \n", + "NNS | \n", + "IN | \n", + "NNS | \n", + "
prev-prev-shape | \n", + "wildcard | \n", + "wildcard | \n", + "capitalized | \n", + "lowercase | \n", + "lowercase | \n", + "
prev-prev-word | \n", + "__START2__ | \n", + "__START1__ | \n", + "Thousands | \n", + "of | \n", + "demonstrators | \n", + "
prev-shape | \n", + "wildcard | \n", + "capitalized | \n", + "lowercase | \n", + "lowercase | \n", + "lowercase | \n", + "
prev-word | \n", + "__START1__ | \n", + "Thousands | \n", + "of | \n", + "demonstrators | \n", + "have | \n", + "
sentence_idx | \n", + "1.0 | \n", + "1.0 | \n", + "1.0 | \n", + "1.0 | \n", + "1.0 | \n", + "
shape | \n", + "capitalized | \n", + "lowercase | \n", + "lowercase | \n", + "lowercase | \n", + "lowercase | \n", + "
word | \n", + "Thousands | \n", + "of | \n", + "demonstrators | \n", + "have | \n", + "marched | \n", + "
tag | \n", + "O | \n", + "O | \n", + "O | \n", + "O | \n", + "O | \n", + "
\n", + " | sentence_idx | \n", + "word | \n", + "tag | \n", + "
---|---|---|---|
0 | \n", + "1.0 | \n", + "Thousands | \n", + "O | \n", + "
1 | \n", + "1.0 | \n", + "of | \n", + "O | \n", + "
2 | \n", + "1.0 | \n", + "demonstrators | \n", + "O | \n", + "
3 | \n", + "1.0 | \n", + "have | \n", + "O | \n", + "
4 | \n", + "1.0 | \n", + "marched | \n", + "O | \n", + "
\n", + " | sentences | \n", + "
---|---|
0 | \n", + "Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country . ... | \n", + "
1 | \n", + "Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as \" Bush Number One Terrorist \" and \" Sto... | \n", + "
2 | \n", + "They marched from the Houses of Parliament to a rally in Hyde Park . They marched from the Houses of Parliament to a rally in Hyde Park . | \n", + "
3 | \n", + "Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 . Police put the number of marchers at 10,000 while organizer... | \n", + "
4 | \n", + "The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton . The pr... | \n", + "
\n"," | 0 | \n","1 | \n","2 | \n","3 | \n","4 | \n","
---|---|---|---|---|---|
0 | \n","5.829379 | \n","7.188879 | \n","5.905200 | \n","-0.412281 | \n","8.191950 | \n","
1 | \n","6.449986 | \n","6.307540 | \n","7.369937 | \n","-0.007682 | \n","7.781770 | \n","
2 | \n","6.641083 | \n","8.506640 | \n","6.149587 | \n","-0.332711 | \n","7.944887 | \n","
3 | \n","7.147574 | \n","6.795663 | \n","5.663146 | \n","0.219597 | \n","7.048291 | \n","
4 | \n","1.926250 | \n","7.740953 | \n","5.823658 | \n","0.978812 | \n","7.035870 | \n","
\n"," | 0 | \n","1 | \n","2 | \n","3 | \n","4 | \n","
---|---|---|---|---|---|
0 | \n","6.384241 | \n","6.152116 | \n","6.909705 | \n","0.143703 | \n","7.433092 | \n","
1 | \n","4.364654 | \n","2.928921 | \n","4.393867 | \n","1.090112 | \n","7.379026 | \n","
2 | \n","6.726593 | \n","8.498932 | \n","6.248105 | \n","-0.239759 | \n","7.818388 | \n","
3 | \n","7.369310 | \n","5.427250 | \n","4.332436 | \n","0.281037 | \n","7.733836 | \n","
4 | \n","6.765358 | \n","4.768935 | \n","4.028739 | \n","0.633608 | \n","7.600544 | \n","
\n", + " | parent | \n", + "child | \n", + "lambda_val | \n", + "child_size | \n", + "
---|---|---|---|---|
0 | \n", + "9900 | \n", + "5759 | \n", + "0.337298 | \n", + "1 | \n", + "
1 | \n", + "9900 | \n", + "1854 | \n", + "0.337298 | \n", + "1 | \n", + "
2 | \n", + "9900 | \n", + "2322 | \n", + "0.337298 | \n", + "1 | \n", + "
3 | \n", + "9900 | \n", + "4936 | \n", + "0.337298 | \n", + "1 | \n", + "
4 | \n", + "9900 | \n", + "7039 | \n", + "0.337298 | \n", + "1 | \n", + "
\n", + " | date | \n", + "year | \n", + "month | \n", + "day | \n", + "title | \n", + "article | \n", + "url | \n", + "section | \n", + "publication | \n", + "tech_health_tag | \n", + "article_word_len | \n", + "cluster_label | \n", + "preprocessed_article | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "2018-08-09 09:11:14 | \n", + "2018 | \n", + "8.0 | \n", + "9 | \n", + "Psychologists’ Group Maintains Ban on Work at ... | \n", + "MIND Members of the American Psychological Ass... | \n", + "https://www.nytimes.com/2018/08/09/health/inte... | \n", + "health | \n", + "The New York Times | \n", + "health | \n", + "700 | \n", + "22 | \n", + "mind members american psychological associatio... | \n", + "
1 | \n", + "2016-04-26 00:00:00 | \n", + "2016 | \n", + "4.0 | \n", + "26 | \n", + "Prince autopsy: What examiners looked for | \n", + "(CNN)Pop superstar Prince died from an accide... | \n", + "https://www.cnn.com/2016/04/26/health/prince-d... | \n", + "health | \n", + "CNN | \n", + "health | \n", + "889 | \n", + "9 | \n", + "superstar prince died accidental overdose opio... | \n", + "
\n", + " | parent | \n", + "child | \n", + "lambda_val | \n", + "child_size | \n", + "cluster_label | \n", + "
---|---|---|---|---|---|
954 | \n", + "9909 | \n", + "1088 | \n", + "3.316230 | \n", + "1 | \n", + "-1 | \n", + "
959 | \n", + "9909 | \n", + "9913 | \n", + "3.333467 | \n", + "575 | \n", + "P | \n", + "
960 | \n", + "9909 | \n", + "9914 | \n", + "3.333467 | \n", + "381 | \n", + "P | \n", + "
\n", + " | Unnamed: 0 | \n", + "lemma | \n", + "next-lemma | \n", + "next-next-lemma | \n", + "next-next-pos | \n", + "next-next-shape | \n", + "next-next-word | \n", + "next-pos | \n", + "next-shape | \n", + "next-word | \n", + "... | \n", + "prev-prev-lemma | \n", + "prev-prev-pos | \n", + "prev-prev-shape | \n", + "prev-prev-word | \n", + "prev-shape | \n", + "prev-word | \n", + "sentence_idx | \n", + "shape | \n", + "word | \n", + "tag | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "thousand | \n", + "of | \n", + "demonstr | \n", + "NNS | \n", + "lowercase | \n", + "demonstrators | \n", + "IN | \n", + "lowercase | \n", + "of | \n", + "... | \n", + "__start2__ | \n", + "__START2__ | \n", + "wildcard | \n", + "__START2__ | \n", + "wildcard | \n", + "__START1__ | \n", + "1.0 | \n", + "capitalized | \n", + "Thousands | \n", + "O | \n", + "
1 | \n", + "1 | \n", + "of | \n", + "demonstr | \n", + "have | \n", + "VBP | \n", + "lowercase | \n", + "have | \n", + "NNS | \n", + "lowercase | \n", + "demonstrators | \n", + "... | \n", + "__start1__ | \n", + "__START1__ | \n", + "wildcard | \n", + "__START1__ | \n", + "capitalized | \n", + "Thousands | \n", + "1.0 | \n", + "lowercase | \n", + "of | \n", + "O | \n", + "
2 | \n", + "2 | \n", + "demonstr | \n", + "have | \n", + "march | \n", + "VBN | \n", + "lowercase | \n", + "marched | \n", + "VBP | \n", + "lowercase | \n", + "have | \n", + "... | \n", + "thousand | \n", + "NNS | \n", + "capitalized | \n", + "Thousands | \n", + "lowercase | \n", + "of | \n", + "1.0 | \n", + "lowercase | \n", + "demonstrators | \n", + "O | \n", + "
3 | \n", + "3 | \n", + "have | \n", + "march | \n", + "through | \n", + "IN | \n", + "lowercase | \n", + "through | \n", + "VBN | \n", + "lowercase | \n", + "marched | \n", + "... | \n", + "of | \n", + "IN | \n", + "lowercase | \n", + "of | \n", + "lowercase | \n", + "demonstrators | \n", + "1.0 | \n", + "lowercase | \n", + "have | \n", + "O | \n", + "
4 | \n", + "4 | \n", + "march | \n", + "through | \n", + "london | \n", + "NNP | \n", + "capitalized | \n", + "London | \n", + "IN | \n", + "lowercase | \n", + "through | \n", + "... | \n", + "demonstr | \n", + "NNS | \n", + "lowercase | \n", + "demonstrators | \n", + "lowercase | \n", + "have | \n", + "1.0 | \n", + "lowercase | \n", + "marched | \n", + "O | \n", + "
5 rows × 25 columns
\n", + "\n", + " | Unnamed: 0 | \n", + "sentence_idx | \n", + "
---|---|---|
count | \n", + "1.050795e+06 | \n", + "1.050794e+06 | \n", + "
mean | \n", + "4.153109e+05 | \n", + "1.898184e+04 | \n", + "
std | \n", + "3.447835e+05 | \n", + "1.576237e+04 | \n", + "
min | \n", + "0.000000e+00 | \n", + "1.000000e+00 | \n", + "
25% | \n", + "1.313490e+05 | \n", + "5.997000e+03 | \n", + "
50% | \n", + "2.626980e+05 | \n", + "1.201700e+04 | \n", + "
75% | \n", + "7.858755e+05 | \n", + "3.592600e+04 | \n", + "
max | \n", + "1.048574e+06 | \n", + "4.795900e+04 | \n", + "
\n", + " | sentence_idx | \n", + "word | \n", + "pos | \n", + "tag | \n", + "
---|---|---|---|---|
0 | \n", + "1.0 | \n", + "Thousands | \n", + "NNS | \n", + "O | \n", + "
1 | \n", + "1.0 | \n", + "of | \n", + "IN | \n", + "O | \n", + "
2 | \n", + "1.0 | \n", + "demonstrators | \n", + "NNS | \n", + "O | \n", + "
3 | \n", + "1.0 | \n", + "have | \n", + "VBP | \n", + "O | \n", + "
4 | \n", + "1.0 | \n", + "marched | \n", + "VBN | \n", + "O | \n", + "
\n", + " | Component_1 | \n", + "Component_2 | \n", + "label | \n", + "
---|---|---|---|
0 | \n", + "0.016871 | \n", + "0.062809 | \n", + "health | \n", + "
1 | \n", + "-0.159731 | \n", + "-0.099900 | \n", + "technology | \n", + "
2 | \n", + "0.047788 | \n", + "0.051254 | \n", + "health | \n", + "
3 | \n", + "-0.196270 | \n", + "-0.078477 | \n", + "technology | \n", + "
4 | \n", + "-0.209384 | \n", + "-0.036187 | \n", + "health | \n", + "
\n", + " | Component_1 | \n", + "Component_2 | \n", + "Component_3 | \n", + "Component_4 | \n", + "Component_5 | \n", + "label | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "7.332139 | \n", + "9.666074 | \n", + "5.394394 | \n", + "-2.601379 | \n", + "2.772509 | \n", + "health | \n", + "
1 | \n", + "4.093304 | \n", + "9.589003 | \n", + "6.360199 | \n", + "-1.844557 | \n", + "11.277091 | \n", + "technology | \n", + "
2 | \n", + "8.916203 | \n", + "8.999536 | \n", + "1.369477 | \n", + "-1.545412 | \n", + "1.207037 | \n", + "health | \n", + "
3 | \n", + "4.303977 | \n", + "9.261281 | \n", + "5.846497 | \n", + "-0.754982 | \n", + "10.190146 | \n", + "technology | \n", + "
4 | \n", + "1.404892 | \n", + "10.766094 | \n", + "-0.089052 | \n", + "1.422105 | \n", + "2.829908 | \n", + "health | \n", + "
\n", + " | Component_1 | \n", + "Component_2 | \n", + "Component_3 | \n", + "Component_4 | \n", + "Component_5 | \n", + "
---|---|---|---|---|---|
0 | \n", + "5.263366 | \n", + "2.210764 | \n", + "-2.252100 | \n", + "-0.922481 | \n", + "8.446700 | \n", + "
1 | \n", + "-0.602789 | \n", + "3.424239 | \n", + "-3.034621 | \n", + "5.537161 | \n", + "10.526425 | \n", + "
2 | \n", + "5.502221 | \n", + "0.009782 | \n", + "1.641876 | \n", + "-1.614253 | \n", + "5.917917 | \n", + "
3 | \n", + "0.486679 | \n", + "3.175171 | \n", + "-1.802012 | \n", + "5.048345 | \n", + "9.231772 | \n", + "
4 | \n", + "-0.209661 | \n", + "-0.869410 | \n", + "4.302149 | \n", + "-1.123776 | \n", + "10.237087 | \n", + "
\n", + " | article | \n", + "section | \n", + "tech_health_tag | \n", + "article_filter | \n", + "
---|---|---|---|---|
0 | \n", + "hamburg reuters germany and poland are discussing new action to prevent the spread of the pig disease african swine fever asf in wild boars in po... | \n", + "Health News | \n", + "health | \n", + "hamburg reuters germany poland discussing action prevent spread disease african swine fever wild boar poland close german border including border ... | \n", + "
1 | \n", + "bits the trajectory of the world’s biggest public company these days appears to be a zigzag. three months ago that company — apple — posted its fi... | \n", + "technology | \n", + "technology | \n", + "bit trajectory world biggest public company day appears zigzag three month company apple posted first yearoveryear revenue decline year sale iphon... | \n", + "
\n", + " | Document | \n", + "Class | \n", + "
---|---|---|
0 | \n", + "hamburg reuters germany poland discussing action prevent spread disease african swine fever wild boar poland close german border including border ... | \n", + "0 | \n", + "
1 | \n", + "bit trajectory world biggest public company day appears zigzag three month company apple posted first yearoveryear revenue decline year sale iphon... | \n", + "1 | \n", + "
Hello World !!
', unsafe_allow_html=True) + + + # with st.form(key="my_form"): + col1, col2, col3 = st.columns([3,1,3]) + + with col1: + prompts = list(EXAMPLES.keys()) + ["Select a document"] + prompt = st.selectbox( + 'Example Inputs', + prompts, + index=2 + ) + + if prompt == "Select a document": + prompt_box = "" + else: + prompt_box = EXAMPLES[prompt] + + + with col3: + + uploaded_file = col3.file_uploader("Upload pdf document", type=".pdf") + if uploaded_file: + # creating a pdf file object + # pdfFileObj = StringIO(uploaded_file.getvalue().decode("utf-8")) + + # creating a pdf reader object + pdfReader = PyPDF2.PdfFileReader(uploaded_file) + + # printing number of pages in pdf file + print(pdfReader.numPages) + + # creating a page object + pageObj = pdfReader.getPage(0) + prompt_box = pageObj.extractText() + # closing the pdf file object + # pdfFileObj.close() + + doc_txt = st.text_area( + "Document:", + prompt_box, height=200 + ) + submit_button = st.button(label="Generate topics") + + + # if not doc_txt: + # st.stop() # pop up message + + + if submit_button: + if doc_txt != "" and len(doc_txt.split(" ")) > 12: + with st.spinner("Generating topics..."): + data = {"document": {"0": doc_txt}} + topics = get_topics(url, data) + + st.markdown("Model Output") + + tab1_result, tab2_result = st.tabs(["Result Tables", "Result Wordcloud" ]) + + st.header("") + df_global = pd.DataFrame(topics['topics']['0']['global'].items()) + df_global['label'], df_global['topics'] = df_global[1].apply(lambda x: x['labels']), df_global[1].apply(lambda x: x['topics']) + df_global = df_global.set_index(df_global[0]) + df_global.drop(1, axis=1, inplace=True) + + df_local = pd.DataFrame(topics['topics']['0']['local'].items()) + df_local['label'], df_local['topics'] = df_local[1].apply(lambda x: x['labels']), df_local[1].apply(lambda x: x['topics']) + df_local = df_local.set_index(df_local[0]) + df_local.drop(1, axis=1, inplace=True) + + with tab1_result: + + st.header("Global Topics") + + st.table(df_global) + + st.header("Local Topics") + + st.table(df_local) + + with tab2_result: + global_topics = df_global['topics'].tolist() + global_labels = df_global['label'].tolist() + local_topics = df_local['topics'].tolist() + local_labels = df_local['label'].tolist() + global_topic_label = global_topics + global_labels + local_topic_label = local_topics + local_labels + col4, col_, col5 = st.columns([2,1,2]) + + with col4: + st.header("Global Topics as a wordcloud") + plot_wordcloud(global_topic_label) + + + with col5: + + st.header("Local Topics as a wordcloud") + plot_wordcloud(local_topic_label) + else: + st.warning('Please insert a document', icon="⚠️") +with tab2: + with st.expander("ℹ️ Named Entity Recognition", expanded=True): + + st.write( + """ + Named Entity Recognition is the task of identifying named entities (people, locations, organizations, etc.) in the input text. + + """ + ) + + tab3, tab4 = st.tabs(["Demo", "Model Info"]) + + with tab3: + prompts_ner = list(NER_EXAMPLES.keys()) + ["Select a Sentence"] + prompt_ner = st.selectbox( + 'Example Document', + prompts_ner, + index=3 + ) + + if prompt_ner == "Select a Sentence": + prompt_box = "" + else: + prompt_box = NER_EXAMPLES[prompt_ner] + + sent_txt = st.text_area( + "Sentence:", + prompt_box, height=100) + submit_button_ner = st.button(label="Run Model") + if submit_button_ner: + if sent_txt != "": + with st.spinner("Generating entities..."): + sent_data = {"sentence": sent_txt} + ner_output = get_ner(url_ner, sent_data) + + st.markdown("Model Output") + st.markdown("Entities") + tokens_ner = ner_output['ner_tags']['tokens'][1:-1] + labels_ner = ner_output['ner_tags']['labels'][1:-1] + + print(zip(tokens_ner, labels_ner)) + annotated_list = [] + ner_entities = ['per','gpe','geo','art','eve','org','tim','nat'] + + for i,token_label in enumerate(zip(tokens_ner, labels_ner)): + token, label = token_label[0], token_label[1] + if label.lower() not in ['o', 'pad']: + tag = label.split("-") + if tag[0] == "B": + collector = token + flag = True + j = i+1 + while flag: + if labels_ner[j].lower() not in ['o', 'pad']: + if labels_ner[j].split("-")[1] != tag[1]: + flag = False + else: + collector = collector + " " +tokens_ner[j] + j += 1 + else: + flag = False + annotated_list.append((collector, tag[1])) + else: + annotated_list.append(token+" ") + print(annotated_list) + # st.write(annotated_list) + annotated_text(*annotated_list) + with tab4: + data_path = os.getcwd() + os.path.join(data_path,"data/modelcard.csv") + df = pd.read_csv(os.path.join(data_path,"data/modelcard.csv"), sep=',') + # df = df.rename(columns={'0':'','1':''}) + + st.table(df) + diff --git a/ui-frontend/app/ner_examples.py b/ui-frontend/app/ner_examples.py new file mode 100644 index 0000000000000000000000000000000000000000..699be3e29425cf5abe6350fbf00450470297e63c --- /dev/null +++ b/ui-frontend/app/ner_examples.py @@ -0,0 +1,7 @@ +NER_EXAMPLES = { + + "Sentence 1":"Andrew Ng is a professor at Stanford University.", + "Sentence 2":"AllenNLP is a PyTorch-based natural language processing library developed at the Allen Institute for Artificial Intelligence in Seattle.", + "Sentence 3":"Jeff Bezos founded e-commerce giant Amazon in 1994 out of his garage in Seattle.", + +} \ No newline at end of file diff --git a/ui-frontend/app/requirements.txt b/ui-frontend/app/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e55492de03830ab0f7636c76cc2c2df54ea44d9 --- /dev/null +++ b/ui-frontend/app/requirements.txt @@ -0,0 +1,8 @@ +streamlit +pandas +numpy +PyPDF2 +st-annotated-text +wordcloud +Pillow +matplotlib \ No newline at end of file