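# Spaces: Runtime error
# NOTE(review): the line above looks like a hosting-page status banner that was
# picked up together with the listing, not script content -- confirm and drop.
# %%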
import re

import fitz
import pandas as pd
# %%
# Location of the PDF that the rest of the script parses.
document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"
# %%
# Number of leading pieces dropped from each re.split() result further down:
# the first piece is whatever text precedes the first heading.
skip_header_offset = 1
# Article heading: "<digits>.-", an optional space, "Artículo", then the
# shortest run of characters that ends in "." or "-".  The whole heading is a
# single capturing group, so re.split() keeps it in its output.
regex_article = re.compile(r"(\d+\.- ?Artículo.+?(?:\.|-))")
# Chapter heading: a line "CAPÍTULO (COM <digits>) " that directly follows a
# newline, plus the next line up to (not including) its trailing " \n".
regex_chapters = re.compile(r"(?<=\n)(CAPÍTULO \(COM \d+\) \n.+?)(?= \n)")
# %%
# Read the PDF once, producing:
#   document     -- the text of every page concatenated in page order
#   page_article -- article heading -> number of the page it was found on
page_texts = []
page_article = {}
pdf_page_offset = 1  # pages are numbered starting from 1
with fitz.open(document_path) as doc:
    for page_idx, page in enumerate(doc, start=pdf_page_offset):
        text = page.get_text()
        # Collect the pieces and join once instead of growing a string with +=.
        page_texts.append(text)
        articles = regex_article.findall(text)
        for article in articles:
            # A heading matched on several pages keeps the last page seen.
            page_article[article] = page_idx
document = "".join(page_texts)
len(page_article)
# %%
# Map every chapter heading to the text that follows it.
chapters = {}
chapter_name = "header"
# regex_chapters has a capturing group, so split() returns
# [leading text, heading, body, heading, body, ...].
splited_chapters = regex_chapters.split(document)
for chapter in splited_chapters[skip_header_offset:]:
    if chapter.startswith("CAPÍTULO"):
        # The heading spans two lines; fold them into "first line: second line".
        chapter_name = chapter.replace(" \n", ": ")
    else:
        # Body text belongs to the most recently seen heading; a repeated
        # heading overwrites the earlier entry.
        chapters[chapter_name] = chapter
len(chapters), chapters.keys()
# %%
# Paragraphs whose cleaned length is at or below this many characters are
# dropped by format_article unless they start with "El Estado".
minimum_article_length = 65


def format_article(article, *, min_length=None):
    """Collapse the raw text of one article into a single line.

    Leading "-" and space characters are stripped from the text, which is then
    split into paragraphs on "\\n \\n". Each paragraph has its newlines and "*"
    characters removed and its surrounding whitespace trimmed. Paragraphs of
    ``min_length`` characters or fewer are discarded unless they start with
    "El Estado". The surviving paragraphs are joined with single spaces.

    Args:
        article: Raw article text.
        min_length: Length threshold for discarding short paragraphs. When
            ``None`` (the default) the module-level
            ``minimum_article_length`` is read at call time.

    Returns:
        The cleaned text; an empty string when no paragraph survives.
    """
    if min_length is None:
        min_length = minimum_article_length

    kept = []
    # A separate loop name keeps the ``article`` parameter from being shadowed.
    for paragraph in article.lstrip("- ").split("\n \n"):
        cleaned = paragraph.replace("\n", "").replace("*", "").strip()
        is_article_single = cleaned.startswith("El Estado")
        is_article_too_short = len(cleaned) <= min_length
        if is_article_too_short and not is_article_single:
            continue
        kept.append(cleaned)
    return " ".join(kept)
# %%
# Build one record per article by walking every chapter and pairing each
# article heading with the text that follows it.
chapter_articles = []
for chapter_name, chapter in chapters.items():
    article_name = "header"
    # regex_article has a capturing group, so split() returns
    # [leading text, heading, body, heading, body, ...].
    splited_articles = regex_article.split(chapter)
    for article in splited_articles[skip_header_offset:]:
        if regex_article.match(article):
            # A heading: remember it for the body piece that comes next.
            article_name = article
            continue
        data = {
            "chapter_name": chapter_name,
            # None when the heading was not found while reading the pages.
            "article_page": page_article.get(article_name),
            "article_name": article_name,
            "article": format_article(article),
        }
        chapter_articles.append(data)
# %%
# Tabulate the records and split each raw heading into a number and a name.
df_document = pd.DataFrame.from_dict(chapter_articles)
# Leading digits of the heading; the extracted value stays a string.
df_document["article_number"] = (
    df_document['article_name']
    .str.extract(r'(^\d+)', expand=False)
)
# Everything after the "<digits>.-" prefix, minus trailing "." / "-" characters.
df_document["article_name"] = (
    df_document['article_name']
    .str.extract(r'^\d+\.- ?(.*)', expand=False)
    .str.rstrip(".-")
)
df_document.head()
# %%
# Write the table without the index column.
df_document.to_csv("data/articles.csv", index=False)
# %%