import os
from typing import List

import spacy
import torch
from dotenv import load_dotenv
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from tabulate import tabulate
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

load_dotenv(".env")
nlp = spacy.load("en_core_web_sm")

def split_text_recursively(text):
    # Recursively split text on newlines into a flat list of lines.
    if '\n' not in text:
        return [text]
    parts = text.split('\n', 1)
    return [parts[0]] + split_text_recursively(parts[1])
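
# A quick sanity check of the recursion above (hypothetical input):
#   split_text_recursively("First line\nSecond line\n")
#   -> ['First line', 'Second line', '']
# The empty string left by a trailing newline is filtered out downstream.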

def tokenize_to_sent(path):
    # Read the file
    with open(path, 'r') as file:
        text = file.read()
    # Strip whitespace, drop empty lines, then sentence-tokenize each line with spaCy
    str_list = split_text_recursively(text)
    str_list = [i.strip() for i in str_list]
    str_list = list(filter(None, str_list))
    sents = []
    for line in str_list:
        doc = nlp(line)
        for sent in doc.sents:
            sents.append(sent.text)
    return sents
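
# Example usage, assuming a plain-text job posting on disk:
#   sents = tokenize_to_sent('./job-postings/03-01-2024/1.txt')
#   sents[:2]  # first two sentences of the posting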

### LLM-based tag extraction with few-shot learning
class TokenTaggingResult(BaseModel):
    tokens: List[str]
    skill_labels: List[str]
    knowledge_labels: List[str]

model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
parser = JsonOutputParser(pydantic_object=TokenTaggingResult)
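# get_format_instructions() renders the TokenTaggingResult JSON schema into the
# prompt so the model replies with JSON that JsonOutputParser can load.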
# Definitions
skill_definition = """
Skill means the ability to apply knowledge and use know-how to complete tasks and solve problems.
"""
knowledge_definition = """
Knowledge means the outcome of the assimilation of information through learning. Knowledge is the body of facts, principles, theories and practices that is related to a field of work or study.
"""
# Few-shot examples
with open('few-shot.txt', 'r') as file:
    few_shot_examples = file.read()

prompt = PromptTemplate(
    template="""You are an expert in tagging tokens with skill and knowledge labels. Use the following definitions to tag the input tokens:
Skill definition:{skill_definition}
Knowledge definition:{knowledge_definition}
Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
    input_variables=["input"],
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
        "few_shot_examples": few_shot_examples,
        "skill_definition": skill_definition,
        "knowledge_definition": knowledge_definition,
    },
)

def extract_tags(text: str, tokenize: bool = True):
    # Returns (tokens, parsed_output); parsed_output is the dict produced by
    # JsonOutputParser, not a TokenTaggingResult instance.
    if tokenize:
        # Tokenize with the JobBERT tokenizer, dropping the [CLS]/[SEP] special tokens
        inputs = tokenizer(text, return_tensors="pt")
        tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
    else:
        tokens = text
    prompt_and_model = prompt | model
    output = prompt_and_model.invoke({"input": tokens})
    output = parser.invoke(output)
    return tokens, output
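
# Example usage (requires OPENAI_API_KEY in .env):
#   tokens, output = extract_tags("Strong Python and SQL skills required")
#   output['skill_labels']  # one label per token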

### Pre-trained model from Hugging Face
mapping = {0: 'B', 1: 'I', 2: 'O'}
token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")

def convert(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        skill_outputs = token_skill_classifier(**inputs)
        knowledge_outputs = token_knowledge_classifier(**inputs)
    # Drop [CLS]/[SEP], then map per-token logits to BIO tags
    decoded_tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
    skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
    knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]
    skill_cls = [mapping[i.item()] for i in skill_cls]
    knowledge_cls = [mapping[i.item()] for i in knowledge_cls]
    # decode() merges WordPiece subwords, so token and label counts can diverge
    if len(decoded_tokens) != len(skill_cls) or len(decoded_tokens) != len(knowledge_cls):
        raise ValueError("Error: Length mismatch between tokens and labels")
    return skill_cls, knowledge_cls, decoded_tokens
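
# Example usage; both JobBERT heads emit one BIO tag per token:
#   skill_cls, knowledge_cls, toks = convert("Experience with Python required")
# 'B' opens a skill/knowledge span, 'I' continues it, 'O' is outside any span.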

pipe = pipeline("token-classification", model="jjzha/jobbert_knowledge_extraction")

def convert2(text):
    # Alternative path via the HF pipeline API. Only the knowledge model is
    # loaded here, so the "skill" labels it returns are really knowledge labels.
    output = pipe(text)
    tokens = [i['word'] for i in output]
    skill_cls = [i['entity'] for i in output]
    knowledge_cls = [i['entity'] for i in output]
    return skill_cls, knowledge_cls, tokens
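
# Design note: unlike convert(), the pipeline drops 'O'-labeled tokens by default
# (ignore_labels=["O"]), so convert2() returns only tokens inside an entity span
# and is not aligned one-to-one with the input.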

def tag_posting(path, llm_extract=True):
    # Read the posting and split it into sentences
    sents = tokenize_to_sent(path)
    for sent in sents:
        # Pre-trained model predictions
        skill_cls, knowledge_cls, tokens = convert(sent)
        if llm_extract:
            # LLM-based tags side by side with the pre-trained predictions
            tokens, output = extract_tags(sent, tokenize=True)
            table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
            headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
        else:
            # Pre-trained predictions only
            table = zip(tokens, skill_cls, knowledge_cls)
            headers = ["Token", "Pred Skill Label", "Pred Knowledge Label"]
        print(tabulate(table, headers=headers, tablefmt="pretty"))

if __name__ == "__main__":
    path = './job-postings/03-01-2024/1.txt'
    tag_posting(path, llm_extract=False)
    quit()
    # Interactive demo; unreachable unless the quit() above is removed
    text = input('Enter text: ')
    # LLM-based tag extraction
    tokens, output = extract_tags(text, tokenize=True)
    # Pre-trained
    skill_cls, knowledge_cls, _ = convert(text)
    table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
    headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
    print(tabulate(table, headers=headers, tablefmt="pretty"))