diff --git "a/scipapers_scatter.ipynb" "b/scipapers_scatter.ipynb" new file mode 100644--- /dev/null +++ "b/scipapers_scatter.ipynb" @@ -0,0 +1,1573 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "1619b254fcbb4cb880d1be5685c74dbc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_607c048fb1634a7689e355036c144984", + "IPY_MODEL_869501d4d38e46f184a66423d93a2745", + "IPY_MODEL_c03dc1381a0c430182fe86d8a100b249" + ], + "layout": "IPY_MODEL_b6d31f4cebc84ef0a563d41482b14cc2" + } + }, + "607c048fb1634a7689e355036c144984": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9ebec18dbdff4913a4902429a726b9e0", + "placeholder": "​", + "style": "IPY_MODEL_c9dc6fbcf53a4c9fb53716a18db6ffbe", + "value": "Map: 100%" + } + }, + "869501d4d38e46f184a66423d93a2745": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c9ef5bf8ff3e44358c4557f74c3e379e", + "max": 1249, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0cc5f439950e49eaa4d417396e21e2c4", + "value": 1249 + } + }, + "c03dc1381a0c430182fe86d8a100b249": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1479cc60b4ac4864b46b592dc1050157", + "placeholder": "​", + "style": "IPY_MODEL_a9e75caedfbf46e0bd0effe1e60065cd", + "value": " 1249/1249 [00:02<00:00, 454.98 examples/s]" + } + }, + "b6d31f4cebc84ef0a563d41482b14cc2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9ebec18dbdff4913a4902429a726b9e0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c9dc6fbcf53a4c9fb53716a18db6ffbe": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c9ef5bf8ff3e44358c4557f74c3e379e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0cc5f439950e49eaa4d417396e21e2c4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "1479cc60b4ac4864b46b592dc1050157": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9e75caedfbf46e0bd0effe1e60065cd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4c7b67b7151e4c9fb47eaae2f39a21b8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5bd552c8824e407c934978e35e7de980", + "IPY_MODEL_d052c01440db4dafb5d699eb57a9d613", + "IPY_MODEL_e1bbc114c9054a28a48762831a44ef11" + ], + "layout": "IPY_MODEL_591bf12de23c41c6aa510f6d6702b30e" + } + }, + "5bd552c8824e407c934978e35e7de980": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5b379ade011143b9bf21c2aedaaf9149", + "placeholder": "​", + "style": "IPY_MODEL_26062d5edbee4879a66829962199ca43", + "value": "encoding: 100%" + } + }, + "d052c01440db4dafb5d699eb57a9d613": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c65d6c4a6d0a44d2a9fb8ca75cc5f790", + "max": 20, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3bb8adf35cf74c3cbd3d2c58912041a3", + "value": 20 + } + }, + "e1bbc114c9054a28a48762831a44ef11": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cc2125fcf9ab49eb9e2be054a4c3fc18", + "placeholder": "​", + "style": "IPY_MODEL_2ff93c21f097436f9ccd61a8c9c8010d", + "value": " 20/20 [00:32<00:00, 1.47s/it]" + } + }, + "591bf12de23c41c6aa510f6d6702b30e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5b379ade011143b9bf21c2aedaaf9149": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "26062d5edbee4879a66829962199ca43": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c65d6c4a6d0a44d2a9fb8ca75cc5f790": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3bb8adf35cf74c3cbd3d2c58912041a3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "cc2125fcf9ab49eb9e2be054a4c3fc18": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2ff93c21f097436f9ccd61a8c9c8010d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Navigating Scientific Papers in 2D Scatter Plots\n", + "A simple way to get a glimpse of how scientific papers are related to one another is to plot their projections on a 2D plain, similar to https://huggingface.co/spaces/gwf-uwaterloo/aclscatter2d.\n", + "\n", + "This notebook provides steps to visualize papers from the [ACL Anthology](https://aclanthology.org/). For this purpose, we first embed papers using a model (e.g. [spectre2](https://huggingface.co/allenai/specter2_base) by default) into dense representations. After clustering them, we apply t-SNE to project them into 2 dimensions for visualization.\n", + "\n", + "**Before running this colab, make sure the runtime type is set to GPU.** We check the availability of GPUs in the \"Checks\" section.\n", + "\n", + "The plot will be generated using [plotly](https://plotly.com/python/getting-started/)." + ], + "metadata": { + "id": "AeaHYgzwgyOF" + } + }, + { + "cell_type": "code", + "source": [ + "# @title XML file name to download from acl-anthology github page\n", + "FILE_NAME = '2023.acl.xml' # @param {type:\"string\"}" + ], + "metadata": { + "cellView": "form", + "id": "mQ31dArhTOmd" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title Model name from huggingface\n", + "MODEL_NAME = 'allenai/specter2_base' # @param {type:\"string\"}\n", + "\n", + "ADAPTER_NAME = \"\" # @param {type:\"string\"}" + ], + "metadata": { + "cellView": "form", + "id": "jSt0Jpueanvn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title Inference args\n", + "BATCH_SIZE = 64 # @param {type:\"integer\"}" + ], + "metadata": { + "cellView": "form", + "id": "HryCbmPBcw5V" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title Visualization args\n", + "NUM_CLUSTERS = 50 # @param {type:\"integer\"}" + ], + "metadata": { + "cellView": "form", + "id": "qyedQTz5ezl4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Setup" + ], + "metadata": { + "id": "jXbz3X1sUHcr" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Install dependencies" + ], + "metadata": { + "id": "O9n1VhtvUQxS" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install datasets\n", + "!pip install transformers\n", + "!pip install adapter-transformers==3.0.1" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d0XchP9jUOhb", + "outputId": "133dcd54-f647-44bf-e5d4-f91383be6640" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[33mWARNING: Ignoring invalid distribution -lotly (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.14.5)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.23.5)\n", + "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n", + "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.7)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.1)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.3.0)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.15)\n", + "Requirement already satisfied: fsspec[http]<2023.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.5)\n", + "Requirement already satisfied: huggingface-hub<1.0.0,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.17.2)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", + "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (3.2.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.14.0->datasets) (3.12.2)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.14.0->datasets) (4.5.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2023.7.22)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3.post1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n", + "\u001b[33mWARNING: Ignoring invalid distribution -lotly (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -lotly (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.33.2)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.15.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.17.2)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.15.1->transformers) (2023.6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.15.1->transformers) (4.5.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n", + "\u001b[33mWARNING: Ignoring invalid distribution -lotly (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -lotly (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: adapter-transformers==3.0.1 in /usr/local/lib/python3.10/dist-packages (3.0.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (3.12.2)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (0.17.2)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (1.23.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (23.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (6.0.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (2023.6.3)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (2.31.0)\n", + "Requirement already satisfied: sacremoses in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (0.0.53)\n", + "Requirement already satisfied: tokenizers!=0.11.3,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (0.13.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from adapter-transformers==3.0.1) (4.66.1)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.1.0->adapter-transformers==3.0.1) (2023.6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.1.0->adapter-transformers==3.0.1) (4.5.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->adapter-transformers==3.0.1) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->adapter-transformers==3.0.1) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->adapter-transformers==3.0.1) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->adapter-transformers==3.0.1) (2023.7.22)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from sacremoses->adapter-transformers==3.0.1) (1.16.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from sacremoses->adapter-transformers==3.0.1) (8.1.7)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from sacremoses->adapter-transformers==3.0.1) (1.3.2)\n", + "\u001b[33mWARNING: Ignoring invalid distribution -lotly (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Imports" + ], + "metadata": { + "id": "c0MMhYc_UKfG" + } + }, + { + "cell_type": "code", + "source": [ + "import json\n", + "import os\n", + "import re\n", + "from functools import partial\n", + "from tqdm.auto import tqdm\n", + "from typing import Any, Iterable, Mapping\n", + "\n", + "import datasets\n", + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "from torch.utils.data import DataLoader\n", + "from transformers import DataCollatorWithPadding, AutoModel, AutoTokenizer, AutoConfig\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.manifold import TSNE\n", + "\n", + "import plotly.express as px" + ], + "metadata": { + "id": "AJULv3wPUG0z" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Checks" + ], + "metadata": { + "id": "BY2W1tBTUVWN" + } + }, + { + "cell_type": "code", + "source": [ + "#@markdown **Check GPU type**\n", + "!nvidia-smi -L\n", + "\n", + "#@markdown **Check PyTorch version**\n", + "print(\"PyTorch version:\", torch.__version__)\n", + "print(\"CUDA version:\", torch.version.cuda)\n", + "print(\"#GPUs:\", torch.cuda.device_count())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "cellView": "form", + "id": "jtYjxTfuUXUb", + "outputId": "4f62a4ba-8b8b-462d-caa6-e002ec2d7b1b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "GPU 0: Tesla T4 (UUID: GPU-5e2802f0-3a72-ee6b-56ce-fc17d7e725c4)\n", + "PyTorch version: 2.0.1+cu118\n", + "CUDA version: 11.8\n", + "#GPUs: 1\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Load Huggingface Stuff" + ], + "metadata": { + "id": "osH8mbM4aCw0" + } + }, + { + "cell_type": "code", + "source": [ + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"true\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", + "\n", + "config = AutoConfig.from_pretrained(MODEL_NAME, return_dict=True, output_hidden_states=True)\n", + "\n", + "model = AutoModel.from_pretrained(MODEL_NAME, config=config)\n", + "if ADAPTER_NAME:\n", + " model.load_adapter(\n", + " ADAPTER_NAME,\n", + " source=\"hf\",\n", + " set_active=True,\n", + " )\n", + "\n", + "model.eval()\n", + "model.to(\"cuda\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6j9EGcCSZ8Z_", + "outputId": "1edfabc5-35b0-47d6-8c58-8cf1e35ca5fe" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "BertModel(\n", + " (shared_parameters): ModuleDict()\n", + " (invertible_adapters): ModuleDict()\n", + " (embeddings): BertEmbeddings(\n", + " (word_embeddings): Embedding(31090, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (token_type_embeddings): Embedding(2, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): BertEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (prefix_tuning): PrefixTuningShim(\n", + " (pool): PrefixTuningPool(\n", + " (prefix_tunings): ModuleDict()\n", + " )\n", + " )\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (adapters): ModuleDict()\n", + " (adapter_fusion_layer): ModuleDict()\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (adapters): ModuleDict()\n", + " (adapter_fusion_layer): ModuleDict()\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pooler): BertPooler(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (activation): Tanh()\n", + " )\n", + " (prefix_tuning): PrefixTuningPool(\n", + " (prefix_tunings): ModuleDict()\n", + " )\n", + ")" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Preparing Data" + ], + "metadata": { + "id": "v9olGFFaP6Un" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Downloading from acl-anthology github" + ], + "metadata": { + "id": "YvFxyYEpP_wj" + } + }, + { + "cell_type": "markdown", + "source": [ + "The paper information can be downloaded from `acl-anthology` github page in the XML format: https://github.com/acl-org/acl-anthology/tree/master/data/xml/" + ], + "metadata": { + "id": "Vm022cIzSorc" + } + }, + { + "cell_type": "code", + "source": [ + "!rm -f $FILE_NAME\n", + "!wget \"https://raw.githubusercontent.com/acl-org/acl-anthology/master/data/xml/$FILE_NAME\"\n", + "\n", + "assert os.path.exists(FILE_NAME), \"Downloaded file exists\"" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "knMDRgK8Sfl_", + "outputId": "ea0abab7-fe9f-4ffa-e627-1b3d4f5a8953" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-09-20 03:28:48-- https://raw.githubusercontent.com/acl-org/acl-anthology/master/data/xml/2023.acl.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 2597735 (2.5M) [text/plain]\n", + "Saving to: ‘2023.acl.xml’\n", + "\n", + "2023.acl.xml 100%[===================>] 2.48M --.-KB/s in 0.02s \n", + "\n", + "2023-09-20 03:28:49 (142 MB/s) - ‘2023.acl.xml’ saved [2597735/2597735]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "download the xml file from this [link](https://github.com/acl-org/acl-anthology/tree/006c7247a6bf0ff859bfd3aab6ea6a19452580ad/data/xml). \n", + "Convert the xml files to jsonl files by running the following code" + ], + "metadata": { + "id": "2KFobPmUbu7j" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Parsing" + ], + "metadata": { + "id": "CUD4LOJlUmMj" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WXQgTZQ103g7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "4edc4fd1-0a7f-4419-ffa3-e1d9f259a139" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "#papers founds in 2023.acl.xml: 1249\n" + ] + } + ], + "source": [ + "import xml.etree.ElementTree as ET\n", + "\n", + "URL_MAPPINGS = dict(\n", + " D=\"emnlp\",\n", + " N=\"naacl\",\n", + " P=\"acl\",\n", + " Q=\"tacl\",\n", + ")\n", + "\n", + "def xml_to_jsonl(xml_file: os.PathLike) -> Iterable[Mapping[str, Any]]:\n", + " tree = ET.parse(xml_file)\n", + " root = tree.getroot()\n", + " papers = root.findall(\".//paper\")\n", + "\n", + " for paper in papers:\n", + " paper_dict = {}\n", + " paper_dict[\"title\"] = \"\".join(paper.find(\"title\").itertext())\n", + "\n", + " authors = []\n", + " for author in paper.findall(\"author\"):\n", + " first_name = author.findtext(\"first\")\n", + " last_name = author.findtext(\"last\")\n", + " authors.append(f\"{first_name} {last_name}\")\n", + " paper_dict[\"authors\"] = authors\n", + "\n", + " paper_dict[\"abstract\"] = \"\" if paper.find(\"abstract\")==None else \"\".join(paper.find(\"abstract\").itertext())\n", + " paper_dict[\"pages\"] = paper.findtext(\"pages\")\n", + " paper_dict[\"url\"] = paper.findtext(\"url\")\n", + " paper_dict[\"bibkey\"] = paper.findtext(\"bibkey\")\n", + " paper_dict[\"doi\"] = paper.findtext(\"doi\")\n", + "\n", + " conference, paper_type = None, None\n", + " matched = re.match(r\"(\\d+)\\.(\\w+)-(\\w+)\\.\\d+\", paper_dict[\"url\"])\n", + " if matched:\n", + " year = int(matched.group(1))\n", + " conference = matched.group(2)\n", + " paper_type = matched.group(3)\n", + " else:\n", + " bibs = paper_dict[\"bibkey\"].split(\"-\")\n", + " for b in range(len(bibs) - 1, -1, -1):\n", + " try:\n", + " year = int(bibs[b])\n", + " break\n", + " except ValueError:\n", + " pass\n", + "\n", + " conference = URL_MAPPINGS.get(paper_dict[\"url\"][0], None)\n", + "\n", + " paper_dict[\"source\"] = conference\n", + " paper_dict[\"year\"] = year\n", + " paper_dict[\"publication_type\"] = paper_type\n", + "\n", + " yield paper_dict\n", + "\n", + "papers = list(xml_to_jsonl(FILE_NAME))\n", + "\n", + "print(f\"#papers founds in {FILE_NAME}: {len(papers)}\")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Encode" + ], + "metadata": { + "id": "3yXoFyHhdd25" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Creating DataLoader" + ], + "metadata": { + "id": "ml0g17tYX2jP" + } + }, + { + "cell_type": "code", + "source": [ + "dataset = datasets.Dataset.from_list(\n", + " [{\"text\": p[\"title\"] + tokenizer.sep_token + (p[\"abstract\"] or \"\"), \"idx\": i + 1} for i, p in enumerate(papers)]\n", + ")\n", + "\n", + "tokenize_fn = lambda batch: tokenizer(batch[\"text\"], padding=True, truncation=True, max_length=512)\n", + "dataset = dataset.map(tokenize_fn, batched=True)\n", + "\n", + "columns = [\"idx\", \"input_ids\", \"attention_mask\"]\n", + "if \"token_type_ids\" in dataset.column_names:\n", + " columns.append(\"token_type_ids\")\n", + "\n", + "data_loader = DataLoader(\n", + " dataset.with_format(\"torch\", columns=columns),\n", + " collate_fn=DataCollatorWithPadding(tokenizer),\n", + " batch_size=BATCH_SIZE,\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 153, + "referenced_widgets": [ + "1619b254fcbb4cb880d1be5685c74dbc", + "607c048fb1634a7689e355036c144984", + "869501d4d38e46f184a66423d93a2745", + "c03dc1381a0c430182fe86d8a100b249", + "b6d31f4cebc84ef0a563d41482b14cc2", + "9ebec18dbdff4913a4902429a726b9e0", + "c9dc6fbcf53a4c9fb53716a18db6ffbe", + "c9ef5bf8ff3e44358c4557f74c3e379e", + "0cc5f439950e49eaa4d417396e21e2c4", + "1479cc60b4ac4864b46b592dc1050157", + "a9e75caedfbf46e0bd0effe1e60065cd" + ] + }, + "id": "sCG1iVa4X7ye", + "outputId": "a287df82-3448-4b24-9e26-582bd7b4b180" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Map: 0%| | 0/1249 [00:00 5:\n", + " return \", \".join(list_of_authors[:5]) + \", et al.\"\n", + " elif len(list_of_authors) > 2:\n", + " return \", \".join(list_of_authors[:-1]) + \", and \" + list_of_authors[-1]\n", + " else:\n", + " return \" and \".join(list_of_authors)\n", + "\n", + "\n", + "for i, (point, c, p) in enumerate(zip(reduced_embeds, clusters, papers)):\n", + " p[\"x\"] = point[0]\n", + " p[\"y\"] = point[1]\n", + " p[\"cluster\"] = c\n", + " p[\"authors_trimmed\"] = [(x[x.index(\",\") + 1 :].strip() + \" \" + x.split(\",\")[0].strip()) if \",\" in x else x for x in p[\"authors\"]]\n", + " if \"publication_type\" in p:\n", + " p[\"type\"] = p.pop(\"publication_type\")\n", + "\n", + "df = pd.DataFrame(papers)\n", + "\n", + "fig = px.scatter(\n", + " df,\n", + " x=\"x\",\n", + " y=\"y\",\n", + " color=\"cluster\",\n", + " width=1000,\n", + " height=800,\n", + " custom_data=(\"title\", \"authors_trimmed\", \"year\", \"source\", \"type\"),\n", + " color_continuous_scale=\"fall\",\n", + ")\n", + "fig.update_traces(\n", + " hovertemplate=\"%{customdata[0]}
%{customdata[1]}
%{customdata[2]}
%{customdata[3]}\"\n", + ")\n", + "fig.update_layout(\n", + " showlegend=False,\n", + " font=dict(\n", + " family=\"Times New Roman\",\n", + " size=30,\n", + " ),\n", + " hoverlabel=dict(\n", + " align=\"left\",\n", + " font_size=14,\n", + " font_family=\"Rockwell\",\n", + " namelength=-1,\n", + " ),\n", + ")\n", + "fig.update_xaxes(title=\"\")\n", + "fig.update_yaxes(title=\"\")\n", + "\n", + "a = fig.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 817 + }, + "cellView": "form", + "id": "B-TwYJM5gtF-", + "outputId": "99a5d7d7-2e49-43af-be93-7677c50effba" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + } + ] +} \ No newline at end of file