{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OjQECQomO51P", "outputId": "5dfd6f54-fa54-4291-e1f4-197f867f5af7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting sentence-transformers\n", " Downloading sentence-transformers-2.2.2.tar.gz (85 kB)\n", "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/86.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.0/86.0 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.35.2)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.66.1)\n", "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (2.1.0+cu121)\n", "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.16.0+cu121)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.23.5)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.11.4)\n", "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n", "Collecting sentencepiece (from sentence-transformers)\n", " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: huggingface-hub>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.19.4)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (3.13.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2023.6.0)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2.31.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (6.0.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (4.5.0)\n", "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (23.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (1.12)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.2.1)\n", "Requirement already satisfied: jinja2 in 
/usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.1.2)\n", "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (2.1.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (2023.6.3)\n", "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (0.15.0)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (0.4.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (8.1.7)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (1.3.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision->sentence-transformers) (9.4.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->sentence-transformers) (2.1.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (3.6)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2023.11.17)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->sentence-transformers) (1.3.0)\n", "Building wheels for collected packages: sentence-transformers\n", " Building wheel for sentence-transformers (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=1d52df49cef1d61dc03e7ba01c0fa3c54702cdf6ba8fb1eadc84908d0388c5f4\n", " Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f\n", "Successfully built sentence-transformers\n", "Installing collected packages: sentencepiece, sentence-transformers\n", "Successfully installed sentence-transformers-2.2.2 sentencepiece-0.1.99\n" ] } ], "source": [ "pip install -U sentence-transformers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SZM2jEH8Pj-F", "outputId": "cdb541d1-47ee-4edf-c068-cfea4097509d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: hnswlib in /usr/local/lib/python3.10/dist-packages (0.8.0)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from hnswlib) (1.23.5)\n" ] } ], "source": [ "pip install hnswlib" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "n-0NKesbP3mh", "outputId": "ac8b352f-72b1-4f93-e36d-817bc0aff9a3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.66.1)\n" ] } ], "source": [ "pip install tqdm" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QYGY5tvvP91W", "outputId": "4196c84b-7b7b-413e-dba8-59fce568c4f2" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Unzipping tokenizers/punkt.zip.\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "# Download the NLTK stopwords\n", "import nltk\n", "nltk.download('punkt')\n", "from nltk.corpus import stopwords\n", "import pickle\n", "import hnswlib\n", "import sentence_transformers as st\n", "from sentence_transformers import SentenceTransformer, util\n", "import time\n", "from tqdm import tqdm\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UNTPJzEsQ-TV" }, "outputs": [], "source": [ "resumes = pd.read_csv(\"/content/resumes25000+.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SCB8sF8xREFt" }, "outputs": [], "source": [ "Preprop_resumes = resumes['Resumes']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "h4yrDNzWRKUX", "outputId": "3b272262-cb6c-47a8-ee4d-d6d9c96295e1" }, "outputs": [ { "data": { "text/plain": [ "0 Full Stack Web Development Teachers Assistant ...\n", "1 Quality Assurance Lead Quality Assurance Lead ...\n", "2 Front End Developer Santa Rosa, CA Authorized ...\n", "3 Student Student Student - New Horizons Compute...\n", "4 Sales Associate/ Part-time Sales Associate/ Pa...\n", " ... 
\n", "25609 Systems Administrator II Darwin, MN Professio...\n", "25610 Systems Administrator Why fit in when you wer...\n", "25611 System Administrator Panama City, FL Authorize...\n", "25612 Systems Administrator I - Direct Distributors...\n", "25613 Systems Administrator - Alfun Consulting Broo...\n", "Name: Resumes, Length: 25614, dtype: object" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Preprop_resumes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4U5IfGwORMrY", "outputId": "7351954f-c6cf-4e9d-af77-5954fbdfee0f" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ROom4dN9RQXl" }, "outputs": [], "source": [ "# Function for cleaning and preprocessing the resume\n", "def clean_resume(resume):\n", " if isinstance(resume, str):\n", " # Convert to lowercase\n", " resume = resume.lower()\n", "\n", " # Remove URLs, RT, cc, hashtags, mentions, non-ASCII characters, punctuation, and extra whitespace\n", " resume = re.sub('http\\S+\\s*|RT|cc|#\\S+|@\\S+|[^\\x00-\\x7f]|[^\\w\\s]', ' ', resume)\n", " resume = re.sub('\\s+', ' ', resume).strip()\n", "\n", " # Tokenize the resume\n", " tokens = nltk.word_tokenize(resume)\n", "\n", " # Remove stopwords\n", " stop_words = set(stopwords.words('english'))\n", " tokens = [token for token in tokens if token.lower() not in stop_words]\n", "\n", " # Join the tokens back into a sentence\n", " preprocessed_resume = ' '.join(tokens)\n", "\n", " return preprocessed_resume\n", " else:\n", " return ''\n", "# Applying the cleaning function to a DataFrame column\n", "Preprop_resumes = Preprop_resumes.apply(lambda x: clean_resume(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "NJavXFK7RVp0" }, "outputs": [], "source": [ "resumes = resumes['Resumes'].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "y1Xu2OzBWBgC" }, "outputs": [], "source": [ "#Load model from HuggingFace Hub\n", "from transformers import AutoTokenizer, AutoModel\n", "import torch\n", "from tqdm import tqdm\n", "# Load model from HuggingFace Hub\n", "tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')\n", "model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1EAqfOR9gNvb" }, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "import torch\n", "from tqdm import tqdm\n", "\n", "\n", "def embed_resumes_with_progress(model, tokenizer, resumes, chunk_size=200):\n", " \"\"\"\n", " Embeds a list of resumes using the SentenceTransformer model with chunking and progress bar.\n", "\n", " Args:\n", " model: The SentenceTransformer model.\n", " tokenizer: The Hugging Face Tokenizer for text pre-processing.\n", " resumes: A list of preprocessed resumes.\n", " chunk_size: Maximum number of tokens per chunk (default: 200).\n", "\n", " Returns:\n", " A numpy array containing the averaged embeddings for each resume.\n", " \"\"\"\n", "\n", " resume_embeddings = []\n", "\n", " use_cuda 
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "1EAqfOR9gNvb" }, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "import torch\n", "from tqdm import tqdm\n", "\n", "\n", "def embed_resumes_with_progress(model, tokenizer, resumes, chunk_size=200):\n", " \"\"\"\n", " Embeds a list of resumes with the Hugging Face all-mpnet-base-v2 model, using character-level chunking and a progress bar.\n", "\n", " Args:\n", " model: The Hugging Face model (AutoModel for all-mpnet-base-v2).\n", " tokenizer: The Hugging Face tokenizer for text pre-processing.\n", " resumes: A list of preprocessed resumes.\n", " chunk_size: Maximum number of characters per chunk (default: 200).\n", "\n", " Returns:\n", " A tensor containing the averaged embedding for each resume.\n", " \"\"\"\n", "\n", " resume_embeddings = []\n", "\n", " use_cuda = torch.cuda.is_available()\n", " device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n", "\n", " if use_cuda:\n", " model.cuda()\n", "\n", " with tqdm(total=len(resumes)) as pbar:\n", " for resume in resumes:\n", " encoded_chunks = []\n", " # Split the resume into fixed-size character chunks\n", " chunks = [resume[i:i + chunk_size] for i in range(0, len(resume), chunk_size)]\n", "\n", " for chunk in chunks:\n", " encoded_chunk = tokenizer(chunk, padding=True, truncation=True, return_tensors=\"pt\")\n", " encoded_chunk.to(device)\n", "\n", " with torch.no_grad():\n", " chunk_embedding = model(**encoded_chunk)[0]\n", " encoded_chunks.append(chunk_embedding.to(\"cpu\"))\n", "\n", " # Concatenate the chunk token embeddings (on CPU) and average over all token positions\n", " concatenated_chunks = torch.cat(encoded_chunks, dim=1)\n", " resume_embedding = torch.mean(concatenated_chunks, dim=1)\n", " resume_embeddings.append(resume_embedding)\n", "\n", " pbar.update(1)\n", "\n", " return torch.cat(resume_embeddings)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "background_save": true, "base_uri": "https://localhost:8080/" }, "id": "psy3yK1hSWdi", "outputId": "1365af01-30b9-4686-bf0b-3e4c19b57065" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 67%|██████▋ | 17274/25614 [1:30:02<35:18, 3.94it/s]" ] } ], "source": [ "# Get resume embeddings\n", "resume_embeddings = embed_resumes_with_progress(model, tokenizer, Preprop_resumes)\n", "\n", "# Access an individual embedding without overwriting the full tensor\n", "first_resume_embedding = resume_embeddings[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "25V744JsF8fv", "outputId": "0af664d3-e478-41b7-df65-69efb5f98488" }, "outputs": [ { "data": { "text/plain": [ "tensor([[-0.0068, 0.1328, -0.0602, ..., -0.0290, -0.0340, -0.0490],\n", " [ 0.0852, 0.0535, -0.0172, ..., -0.0350, 0.0221, -0.0468],\n", " [ 0.0327, 0.1439, -0.0488, ..., -0.0028, -0.0313, -0.0672],\n", " ...,\n", " [ 0.1147, 0.1450, -0.0392, ..., 0.0226, -0.0336, -0.0666],\n", " [ 0.1217, 0.0510, -0.0582, ..., 0.0107, -0.0487, -0.0583],\n", " [ 0.1242, 0.1156, -0.0432, ..., -0.0239, 0.0333, -0.0741]])" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "resume_embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lzjE40xEMV3E", "outputId": "bebfda36-3750-4769-d1b5-fef944e453bf" }, "outputs": [ { "data": { "text/plain": [ "torch.Size([9, 768])" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "resume_embeddings.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aURQbssdFQ49", "outputId": "666780b8-c08a-4ab7-bf75-f940893c3c5c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/gdrive')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bV8nBxEQIKXy" }, "outputs": [], "source": [ "# Save the model and tokenizer to the same Drive directory\n", "model.save_pretrained(\"/content/gdrive/MyDrive/fine_tuned_mpnetwithchunking_v2\")\n", "tokenizer.save_pretrained('/content/gdrive/MyDrive/fine_tuned_mpnetwithchunking_v2')" ] },
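{ "cell_type": "markdown", "metadata": {}, "source": [ "Embedding all 25,614 resumes takes well over an hour on a Colab GPU (see the progress bar above), so it is worth persisting the embedding tensor as well as the model. The cell below is an illustrative sketch; the Drive file name is an assumption rather than a path used elsewhere in this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative sketch: save/reload the computed embeddings so the long encoding run\n", "# does not have to be repeated. The path below is a hypothetical Drive location.\n", "import torch\n", "\n", "EMBEDDINGS_PATH = \"/content/gdrive/MyDrive/resume_embeddings.pt\"\n", "\n", "# Save once after embedding:\n", "# torch.save(resume_embeddings, EMBEDDINGS_PATH)\n", "\n", "# Reload in a later session (after mounting Drive):\n", "# resume_embeddings = torch.load(EMBEDDINGS_PATH)\n", "# resume_embeddings.shape" ] },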
"code", "execution_count": null, "metadata": { "id": "7vPDGLbhIKZO" }, "outputs": [], "source": [ "mpnetwith_chunking = ('Abaabookoo/mpnet_withchunking')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "N4LQsH9vR1A0" }, "outputs": [], "source": [ "def clean_JD(JD):\n", " \"\"\"\n", " Preprocesses the provided JD by:\n", " - Lowercasing all text\n", " - Removing punctuation\n", " - Removing stop words and punctuation and sympols\n", " \"\"\"\n", " JD = JD.lower()\n", " JD = re.sub(r\"[^\\w\\s]\", \"\", JD)\n", " stop_words = stopwords.words(\"english\")\n", " filtered_words = [word for word in JD.split() if word not in stop_words]\n", " cleaned_JD = \" \".join(filtered_words)\n", " return cleaned_JD" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "IBrQ2KLqTJj2" }, "outputs": [], "source": [ "job_description = \"\"\"\n", "\n", "Boyave\n", "Full-Time\n", "Description\n", "Content Creator\n", "Job brief\n", "We are looking for a Content Creator to write and publish various types of pieces for our company’s web pages, like articles, ebooks and social media posts.\n", "Content Creator responsibilities include producing marketing copy to advertise our products, writing blog posts about industry-related topics and promoting our content on social media. To be successful in this role, you should have experience with digital publishing and generating traffic and leads for new business. Please share samples of your work (portfolio or links to published articles) along with your application.\n", "Ultimately, you will help us reach our target audience by delivering both useful and appealing online information about our company and products.\n", "Responsibilities\n", "•\tResearch industry-related topics\n", "•\tPrepare well-structured drafts using digital publishing platforms\n", "•\tCreate and distribute marketing copy to advertise our company and products\n", "•\tInterview industry professionals and incorporate their views in blog posts\n", "•\tEdit and proofread written pieces before publication\n", "•\tConduct keyword research and use SEO guidelines to optimize content\n", "•\tPromote content on social networks and monitor engagement (e.g. comments and shares)\n", "•\tIdentify customers’ needs and recommend new topics\n", "•\tCoordinate with marketing and design teams to illustrate articles\n", "•\tMeasure web traffic to content (e.g. conversion and bounce rates)\n", "•\tUpdate our websites as needed\n", "Requirements and skills\n", "•\tProven work experience as a Content Creator, Copywriter or similar role\n", "•\tPortfolio of published articles\n", "•\tHands-on experience with Content Management Systems (e.g. 
WordPress)\n", "•\tExcellent writing and editing skills in English\n", "•\tAn ability to fact-check long-form content pieces\n", "•\tTime-management skills\n", "•\tFamiliarity with SEO\n", "•\tBSc in Marketing, English, Journalism or relevant field\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Stf7MUqXT1Aw", "outputId": "54a7511e-aa37-42b6-e2e1-612177887617" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cleaned Job Description: boyave fulltime description content creator job brief looking content creator write publish various types pieces companys web pages like articles ebooks social media posts content creator responsibilities include producing marketing copy advertise products writing blog posts industryrelated topics promoting content social media successful role experience digital publishing generating traffic leads new business please share samples work portfolio links published articles along application ultimately help us reach target audience delivering useful appealing online information company products responsibilities research industryrelated topics prepare wellstructured drafts using digital publishing platforms create distribute marketing copy advertise company products interview industry professionals incorporate views blog posts edit proofread written pieces publication conduct keyword research use seo guidelines optimize content promote content social networks monitor engagement eg comments shares identify customers needs recommend new topics coordinate marketing design teams illustrate articles measure web traffic content eg conversion bounce rates update websites needed requirements skills proven work experience content creator copywriter similar role portfolio published articles handson experience content management systems eg wordpress excellent writing editing skills english ability factcheck longform content pieces timemanagement skills familiarity seo bsc marketing english journalism relevant field\n" ] } ], "source": [ "cleaned_job_description = clean_JD(job_description)\n", "print(\"Cleaned Job Description:\", cleaned_job_description)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PjLu1p7-Lyw9" }, "outputs": [], "source": [ "import torch\n", "\n", "def embed_JD_with_progress(model, tokenizer, cleaned_job_description, chunk_size=200):\n", " \"\"\"\n", " Embeds a job description using the SentenceTransformer model with chunking and progress bar.\n", "\n", " Args:\n", " model: The SentenceTransformer model.\n", " tokenizer: The Hugging Face Tokenizer for text pre-processing.\n", " cleaned_job_description: A preprocessed job description string.\n", " chunk_size: Maximum number of tokens per chunk (default: 200).\n", "\n", " Returns:\n", " A numpy array containing the embedding for the job description.\n", " \"\"\"\n", "\n", " encoded_chunks = []\n", " chunks = [cleaned_job_description[i:i+chunk_size] for i in range(0, len(cleaned_job_description), chunk_size)]\n", "\n", " use_cuda = torch.cuda.is_available()\n", " device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n", " model.to(device) # Move model to GPU or CPU\n", "\n", " with tqdm(total=len(chunks), desc=\"Embedding Job Description\") as pbar:\n", " for chunk in chunks:\n", " encoded_chunk = tokenizer(chunk, padding=True, truncation=True, return_tensors=\"pt\").to(device) # Move input to device\n", " with torch.no_grad():\n", " chunk_embedding = 
model(**encoded_chunk)[0]\n", " encoded_chunks.append(chunk_embedding.to(\"cpu\")) # Move output back to CPU\n", " pbar.update(1)\n", "\n", " # Concatenate the chunk token embeddings and average over all token positions\n", " concatenated_chunks = torch.cat(encoded_chunks, dim=1)\n", " JD_embeddings = torch.mean(concatenated_chunks, dim=1)\n", " return JD_embeddings.numpy()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XVcxz9m5UA_3", "outputId": "9e575368-0865-43cc-b9f8-831942b322a9" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Embedding Job Description: 100%|██████████| 8/8 [00:00<00:00, 87.36it/s]\n" ] } ], "source": [ "# Get the job description embedding\n", "JD_embeddings = embed_JD_with_progress(model, tokenizer, cleaned_job_description)\n", "\n", "# Access the individual embedding vector\n", "first_JD_embedding = JD_embeddings[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Cf98RqMxMLfi", "outputId": "29387588-b90a-473e-8e1c-1d46ced327c0" }, "outputs": [ { "data": { "text/plain": [ "(1, 768)" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "JD_embeddings.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Xt6ldCC9UFps" }, "outputs": [], "source": [ "# Heuristic offsets that map raw cosine similarity onto a more conservative match score\n", "def similarity_percentage(similarity_score):\n", " if similarity_score < 0.2:\n", " return 0\n", " elif 0.2 <= similarity_score < 0.3:\n", " return similarity_score - 0.25\n", " elif 0.3 <= similarity_score < 0.4:\n", " return similarity_score - 0.23\n", " elif 0.4 <= similarity_score < 0.55:\n", " return similarity_score - 0.19\n", " elif 0.55 <= similarity_score < 0.65:\n", " return similarity_score - 0.14\n", " else:\n", " return similarity_score - 0.1" ] },
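{ "cell_type": "markdown", "metadata": {}, "source": [ "The next cells build an approximate nearest-neighbour index with hnswlib and query it with the job-description embedding. Because HNSW search is approximate, an exact ranking computed by brute-force cosine similarity over the full embedding matrix is a useful cross-check. The sketch below is illustrative: `exact_top_k` is a helper introduced here, and `util.cos_sim` comes from the sentence_transformers package installed at the top of the notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative sketch: exact top-k by brute-force cosine similarity, useful for\n", "# spot-checking the approximate HNSW results produced below.\n", "import torch\n", "from sentence_transformers import util\n", "\n", "def exact_top_k(jd_embedding, corpus_embeddings, k=5):\n", "    # util.cos_sim accepts numpy arrays or tensors; the result has shape (1, num_resumes)\n", "    scores = util.cos_sim(jd_embedding, corpus_embeddings)[0]\n", "    top = torch.topk(scores, k=min(k, scores.shape[0]))\n", "    return list(zip(top.indices.tolist(), top.values.tolist()))\n", "\n", "# Example (uncomment once JD_embeddings and resume_embeddings exist):\n", "# exact_top_k(JD_embeddings, resume_embeddings, k=5)" ] },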
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "I2COAaJuULH-" }, "outputs": [], "source": [ "def create_hnsw_index(embeddings, max_elements, ef_construction, M, ef):\n", " \"\"\"\n", " Creates and initializes an hnswlib index with the specified parameters.\n", "\n", " Args:\n", " embeddings: An array of embedding vectors.\n", " max_elements: Maximum number of elements to store in the index.\n", " ef_construction: Size of the candidate list used during index construction.\n", " M: Maximum number of connections per node in the HNSW graph.\n", " ef: Size of the candidate list used during search.\n", "\n", " Returns:\n", " An hnswlib index object.\n", " \"\"\"\n", " # all-mpnet-base-v2 produces 768-dimensional embeddings\n", " embedding_size = 768\n", " index = hnswlib.Index(space='cosine', dim=embedding_size)\n", " index.init_index(max_elements=max_elements, ef_construction=ef_construction, M=M)\n", " index.add_items(embeddings, list(range(len(embeddings))))\n", " index.set_ef(ef)\n", " return index" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 366 }, "id": "_b1NVudOUS_t", "outputId": "f06d6086-8d33-41d7-d10a-cbba96da9493" }, "outputs": [], "source": [ "# Define parameters for the index\n", "max_elements = len(resume_embeddings)\n", "ef_construction = 2000\n", "M = 200\n", "ef = 50\n", "index = create_hnsw_index(resume_embeddings, max_elements, ef_construction, M, ef)\n", "print(\"Corpus loaded with {} resumes / embeddings\".format(len(resume_embeddings)))\n", "\n", "# Retrieve resumes based on the job description\n", "take_k_hits = int(input(\"\\nHow many top resumes do you want to be retrieved?\\n\\n\"))\n", "\n", "start_time = time.time()\n", "\n", "resume_ids, dist = index.knn_query(JD_embeddings, take_k_hits)\n", "\n", "# Calculate similarity scores (hnswlib returns cosine distance, so similarity = 1 - distance)\n", "hits = [{'resume_id': rid, 'Original_Score': 1 - score, 'Adjusted_Score': similarity_percentage(1 - score)} for rid, score in zip(resume_ids[0], dist[0])]\n", "hits = sorted(hits, key=lambda x: x['Adjusted_Score'], reverse=True)\n", "\n", "end_time = time.time()\n", "\n", "print(\"Results (after {:.3f} seconds):\".format(end_time - start_time))\n", "\n", "# Create a DataFrame with original and adjusted similarity scores\n", "Resumeranking = pd.DataFrame(hits[:take_k_hits])\n", "Resumeranking['Resumes'] = Resumeranking['resume_id'].map(lambda x: resumes[x])\n", "Resumeranking = Resumeranking.drop(['resume_id'], axis=1)\n", "\n", "# Convert Adjusted_Score to percentage format\n", "Resumeranking['Adjusted_Score'] = (Resumeranking['Adjusted_Score'] * 100).round(2)\n", "Resumeranking['Adjusted_Score'] = Resumeranking['Adjusted_Score'].astype(str) + '%'\n", "\n", "Resumeranking = Resumeranking[['Resumes', 'Original_Score', 'Adjusted_Score']]\n", "Resumeranking" ] },
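{ "cell_type": "markdown", "metadata": {}, "source": [ "Once the index has been built and queried in the cell above, hnswlib can serialize it so retrieval can run later without re-embedding or re-inserting the resumes. The sketch below uses hnswlib's own `save_index`/`load_index` methods; the file path is an assumption." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative sketch: persist the populated HNSW index so retrieval does not require\n", "# rebuilding it. The file path is a hypothetical Drive location.\n", "INDEX_PATH = \"/content/gdrive/MyDrive/resume_hnsw_index.bin\"\n", "\n", "# Save the index built in the previous cell:\n", "# index.save_index(INDEX_PATH)\n", "\n", "# Reload it later (the dimension must match the saved index):\n", "# loaded_index = hnswlib.Index(space='cosine', dim=768)\n", "# loaded_index.load_index(INDEX_PATH, max_elements=len(resume_embeddings))\n", "# loaded_index.set_ef(50)\n", "# resume_ids, dist = loaded_index.knn_query(JD_embeddings, k=5)" ] },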
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "DJzSu07bL7hx" }, "outputs": [], "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }