import re

import spacy

# Load the small English spaCy pipeline once at import time; parse_post()
# calls this shared object on every line it processes.
nlp = spacy.load("en_core_web_sm")
def split_text_recursively(text):
    """Split *text* on newline characters and return the pieces as a list.

    The result always contains at least one element: text without any
    newline (including the empty string) yields ``[text]``. Empty pieces
    produced by consecutive or trailing newlines are kept.

    The name is historical. The split is done in a single ``str.split``
    call rather than one recursive call per newline, so long inputs cannot
    hit the interpreter's recursion limit.

    Args:
        text: The string to split.

    Returns:
        list[str]: The newline-separated pieces of *text*, in order.
    """
    return text.split('\n')
def parse_post(path):
    """Read a text file, split it into sentences with spaCy, and print them.

    The file is split on newlines, each piece is stripped of surrounding
    whitespace, and empty pieces are dropped. Every remaining line is run
    through the module-level ``nlp`` pipeline and each sentence it reports
    is printed to stdout.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: The text of every sentence, in the order printed.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()

    stripped = (piece.strip() for piece in split_text_recursively(text))
    lines = [piece for piece in stripped if piece]

    sents = []
    for line in lines:
        doc = nlp(line)
        for sent in doc.sents:
            print(f"{sent.text}")
            sents.append(sent.text)

    return sents
# Script driver: parse one hard-coded job posting and print its sentences.
path = './job-postings/03-01-2024/2.txt'
parse_post(path)