import os
from typing import List

import torch
import spacy
from dotenv import load_dotenv
from tabulate import tabulate
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

load_dotenv(".env")

nlp = spacy.load("en_core_web_sm")

def split_text_recursively(text):
    """Split text into lines, one recursive call per newline (equivalent to text.split('\\n'))."""
    if '\n' not in text:
        return [text]
    parts = text.split('\n', 1)
    return [parts[0]] + split_text_recursively(parts[1])


def tokenize_to_sent(path):
    """Read a text file and return its contents as a list of sentences."""

    # Read the file
    with open(path, 'r') as file:
        text = file.read()

    # Split into non-empty, stripped lines
    str_list = split_text_recursively(text)
    str_list = [line.strip() for line in str_list]
    str_list = list(filter(None, str_list))

    # Sentence tokenization with spaCy
    sents = []
    for line in str_list:
        doc = nlp(line)
        for sent in doc.sents:
            sents.append(sent.text)

    return sents
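
# Illustrative usage (a sketch, using the sample posting path from the __main__ block below):
#
#   sents = tokenize_to_sent('./job-postings/03-01-2024/1.txt')
#   print(sents[:3])   # first few sentences of the posting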


### LLM-based tag extraction with few-shot learning

class TokenTaggingResult(BaseModel):
    """Structured output: parallel lists of tokens and their skill/knowledge labels."""
    tokens: List[str]
    skill_labels: List[str]
    knowledge_labels: List[str]


model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
parser = JsonOutputParser(pydantic_object=TokenTaggingResult)

# Definitions

skill_definition = """
Skill means the ability to apply knowledge and use know-how to complete tasks and solve problems.
"""

knowledge_definition = """
Knowledge means the outcome of the assimilation of information through learning. Knowledge is the body of facts, principles, theories and practices that is related to a field of work or study.
"""

# Few-shot examples
with open('few-shot.txt', 'r') as file:
    few_shot_examples = file.read()

prompt = PromptTemplate(
    template="""You are an expert in tagging tokens with skill and knowledge labels. Use the following definitions to tag the input tokens:
    Skill definition:{skill_definition}
    Knowledge definition:{knowledge_definition}
    Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions(),
                       "few_shot_examples": few_shot_examples,
                       "skill_definition": skill_definition,
                       "knowledge_definition": knowledge_definition},
)

def extract_tags(text: str, tokenize: bool = True):
    """Tag `text` with skill/knowledge labels via the LLM; returns (tokens, parsed output)."""

    if tokenize:
        # Tokenize with the JobBERT tokenizer and drop the [CLS]/[SEP] special tokens
        inputs = tokenizer(text, return_tensors="pt")
        tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
    else:
        # Fall back to whitespace tokens if no subword tokenization is requested
        tokens = text.split()

    prompt_and_model = prompt | model
    output = prompt_and_model.invoke({"input": tokens})
    output = parser.invoke(output)
    return tokens, output
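
# Illustrative usage (a sketch; assumes OPENAI_API_KEY is set and few-shot.txt is present):
#
#   tokens, output = extract_tags("Experience with Python and stakeholder management")
#   # output is a dict parsed from the LLM's JSON, with one entry per token in
#   # output['skill_labels'] and output['knowledge_labels'] (format follows the few-shot examples).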


### Pre-trained model from Hugging Face

# Map class indices to BIO tags
mapping = {0: 'B', 1: 'I', 2: 'O'}
token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")

def convert(text):
    """Run both JobBERT classifiers on `text`; returns (skill tags, knowledge tags, tokens)."""
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        skill_outputs = token_skill_classifier(**inputs)
        knowledge_outputs = token_knowledge_classifier(**inputs)

    # Drop the [CLS]/[SEP] special tokens so labels align with the decoded tokens
    decoded_tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
    skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
    knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]

    skill_cls = [mapping[i.item()] for i in skill_cls]
    knowledge_cls = [mapping[i.item()] for i in knowledge_cls]

    if len(decoded_tokens) != len(skill_cls) or len(decoded_tokens) != len(knowledge_cls):
        raise ValueError("Error: length mismatch between tokens and labels")

    return skill_cls, knowledge_cls, decoded_tokens
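
# Illustrative usage (a sketch; exact tags depend on the model outputs):
#
#   skills, knowledge, toks = convert("Proficiency in SQL and data modelling")
#   # toks is the subword token list, skills/knowledge are BIO tags aligned with it,
#   # e.g. ['O', 'O', 'B', ...]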


# Alternative: Hugging Face pipeline API (knowledge extraction only)
pipe = pipeline("token-classification", model="jjzha/jobbert_knowledge_extraction")

def convert2(text):
    """Knowledge-only variant using the pipeline API; returns (skill tags, knowledge tags, tokens).

    Note: only the knowledge model is run here, so the skill labels simply mirror
    the knowledge labels to keep the return signature compatible with convert().
    """
    output = pipe(text)
    tokens = [i['word'] for i in output]
    knowledge_cls = [i['entity'] for i in output]
    skill_cls = list(knowledge_cls)

    return skill_cls, knowledge_cls, tokens
    



def tag_posting(path, llm_extract=True):
    """Tag every sentence of the job posting at `path` and print the results as tables."""

    # Reading & sentence tokenization
    sents = tokenize_to_sent(path)

    for sent in sents:
        # Pre-trained JobBERT labels for this sentence
        skill_cls, knowledge_cls, tokens = convert(sent)

        if llm_extract:
            # LLM-based tag extraction, printed side by side with the pre-trained labels
            llm_tokens, output = extract_tags(sent, tokenize=True)
            table = zip(llm_tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
            headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
            print(tabulate(table, headers=headers, tablefmt="pretty"))
        else:
            # Only pre-trained labels
            table = zip(tokens, skill_cls, knowledge_cls)
            headers = ["Token", "Pred Skill Label", "Pred Knowledge Label"]
            print(tabulate(table, headers=headers, tablefmt="pretty"))



if __name__ == "__main__":

    path = './job-postings/03-01-2024/1.txt'
    tag_posting(path, llm_extract=False)

    # The interactive mode below is disabled; remove quit() to enable it.
    quit()
    text = input('Enter text: ')

    # LLM-based tag extraction
    tokens, output = extract_tags(text, tokenize=True)

    # Pre-trained
    skill_cls, knowledge_cls, _ = convert(text)

    table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
    headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
    print(tabulate(table, headers=headers, tablefmt="pretty"))