# borrador_constitucion_chile / pdf_to_text.py
# palegre
# Add application file beta.
# b19c8bc
# %%
import re
import fitz
import pandas as pd
# %%
# Location of the source PDF, relative to the working directory.
document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"
# %%
# Article heading: a running number, ".-", an optional space, the word
# "Artículo", then the shortest run of text that ends in "." or "-".
# The single capturing group makes re.split() keep the headings.
regex_article = re.compile(
    r"(\d+\.- ?Artículo.+?(?:\.|-))"
)
# Chapter heading: "CAPÍTULO (COM <n>) " at the start of a line followed by
# the next line of text, up to (but excluding) its trailing " \n".
regex_chapters = re.compile(
    r"(?<=\n)(CAPÍTULO \(COM \d+\) \n.+?)(?= \n)"
)
# Number of leading items dropped from a re.split() result, i.e. the text
# that precedes the first heading.
skip_header_offset = 1
# %%
# Read the PDF page by page: keep every page's text for the full document
# and remember the page number on which each article heading is found.
page_article = {}
pdf_page_offset = 1  # enumerate() start value, so the first page is page 1
page_texts = []
with fitz.open(document_path) as doc:
    for page_number, page in enumerate(doc, pdf_page_offset):
        page_text = page.get_text()
        page_texts.append(page_text)
        # A heading seen again on a later page replaces the earlier entry.
        page_article.update(
            dict.fromkeys(regex_article.findall(page_text), page_number)
        )
# Full document text: all pages concatenated in order, with no separator.
document = "".join(page_texts)
len(page_article)
# %%
# Map each chapter title to the text that follows it. Splitting on a pattern
# with a capturing group yields headings and bodies as separate list items.
chapters = {}
chapter_name = "header"
chapter_parts = regex_chapters.split(document)
for part in chapter_parts[skip_header_offset:]:
    if not part.startswith("CAPÍTULO"):
        # Body text: file it under the most recently seen heading.
        chapters[chapter_name] = part
        continue
    # Heading: turn "CAPÍTULO (COM n) \nTITLE" into "CAPÍTULO (COM n): TITLE".
    chapter_name = part.replace(" \n", ": ")
len(chapters), chapters.keys()
# %%
# Paragraphs of this many characters or fewer are dropped by format_article().
minimum_article_length = 65


def format_article(article, *, min_length=None, keep_prefixes=("El Estado",)):
    """Flatten the raw text of one article into a single line.

    Leading dashes and spaces are stripped, then the text is split into
    paragraphs wherever a line holding a single space separates them. Inside
    each paragraph, line breaks and asterisks are removed and surrounding
    whitespace is trimmed. Paragraphs whose cleaned length is at most
    ``min_length`` are discarded unless they start with one of
    ``keep_prefixes``. The surviving paragraphs are joined with one space.

    Args:
        article: Raw article text as extracted from the PDF.
        min_length: Length threshold; paragraphs with ``len <= min_length``
            are dropped. ``None`` (the default) uses the module-level
            ``minimum_article_length`` at call time.
        keep_prefixes: A string or iterable of strings; a short paragraph
            starting with any of them is kept anyway.

    Returns:
        The cleaned paragraphs joined by single spaces, or ``""`` when none
        survive.
    """
    if min_length is None:
        min_length = minimum_article_length
    # Accept a bare string so it is not exploded into single characters.
    if isinstance(keep_prefixes, str):
        keep_prefixes = (keep_prefixes,)
    else:
        keep_prefixes = tuple(keep_prefixes)

    kept = []
    for paragraph in article.lstrip("- ").split("\n \n"):
        cleaned = paragraph.replace("\n", "").replace("*", "").strip()
        # NOTE(review): short paragraphs are presumably page furniture
        # (titles, page numbers) - confirm against the source PDF.
        if len(cleaned) <= min_length and not cleaned.startswith(keep_prefixes):
            continue
        kept.append(cleaned)
    return " ".join(kept)
# %%
# Build one record per article body. Splitting a chapter on the article
# pattern alternates headings and bodies; each body is attributed to the
# heading that precedes it.
chapter_articles = []
for chapter_title, chapter_text in chapters.items():
    current_heading = "header"
    pieces = regex_article.split(chapter_text)[skip_header_offset:]
    for piece in pieces:
        if regex_article.match(piece):
            # A heading: remember it for the body that follows.
            current_heading = piece
        else:
            chapter_articles.append(
                {
                    "chapter_name": chapter_title,
                    # None when the heading was not found on any single page.
                    "article_page": page_article.get(current_heading),
                    "article_name": current_heading,
                    "article": format_article(piece),
                }
            )
# %%
# Build the article table. The DataFrame constructor is the documented entry
# point for a list of row dicts (from_dict is specified for dict input), and
# naming the columns keeps the column lookups below valid even when no
# article was extracted.
df_document = pd.DataFrame(
    chapter_articles,
    columns=["chapter_name", "article_page", "article_name", "article"],
)
# Leading digits of the raw heading, kept as text.
df_document["article_number"] = df_document["article_name"].str.extract(
    r'(^\d+)', expand=False
)
# Drop the "<number>.-" prefix and any trailing "." / "-" from the heading.
df_document["article_name"] = (
    df_document["article_name"]
    .str.extract(r'^\d+\.- ?(.*)', expand=False)
    .str.rstrip(".-")
)
df_document.head()
# %%
# Write the article table to CSV without the DataFrame index column.
df_document.to_csv("data/articles.csv", index=False)
# %%