Spaces:

Robzy
/

jobbert_knowledge_extraction

Running

jobbert_knowledge_extraction / tag-posting.py

changes

0049d2e about 2 months ago

785 Bytes

	import spacy
	import re

	# Load the spaCy "en_core_web_sm" pipeline once at module level; parse_post
	# reuses this shared object for every line it processes.
	nlp = spacy.load("en_core_web_sm")

	def split_text_recursively(text):
	    """Split *text* on newline characters and return the pieces in order.

	    A string without any newline yields a one-element list containing the
	    string itself (including the empty string). Empty pieces produced by
	    consecutive or trailing newlines are kept; callers filter them.

	    The name is kept for backward compatibility, but the split is done
	    iteratively by ``str.split``: recursing once per newline raised
	    ``RecursionError`` on inputs with more lines than the interpreter's
	    recursion limit and copied the remaining text at every level.

	    Args:
	        text: The string to split.

	    Returns:
	        list[str]: The newline-separated pieces of *text*.
	    """
	    return text.split('\n')

	def parse_post(path):
	    """Read the posting at *path*, print each sentence, and return them all.

	    The file content is split into lines, each line is stripped of
	    surrounding whitespace, and empty lines are dropped. Every remaining
	    line is passed on its own through the module-level spaCy pipeline
	    ``nlp``, and the text of each sentence it reports is printed and
	    collected.

	    Args:
	        path: Path of the text file to read.

	    Returns:
	        list[str]: The sentence texts, in the order they were printed.
	    """
	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # platform's default locale encoding.
	    with open(path, 'r', encoding='utf-8') as file:
	        text = file.read()

	    # Sentence tokenization: lines are processed one at a time, so a
	    # sentence never spans a line break.
	    stripped = [piece.strip() for piece in split_text_recursively(text)]
	    lines = [line for line in stripped if line]

	    sents = []
	    for line in lines:
	        for sent in nlp(line).sents:
	            print(sent.text)
	            sents.append(sent.text)

	    # TODO: skill/knowledge extraction over `sents` is not implemented yet.

	    return sents




	# Run the parser on one hard-coded posting; the path is relative to the
	# current working directory.
	path = './job-postings/03-01-2024/2.txt'
	parse_post(path)