Spaces:
Runtime error
Runtime error
File size: 2,521 Bytes
b19c8bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# %%
import re
import fitz
import pandas as pd
# %%
# Source PDF that the rest of the script parses.
document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"
# %%
# Chapter heading: "CAPÍTULO (COM <n>) " on one line plus the text of the
# next line, captured as one group so that re.split() keeps the heading.
regex_chapters = re.compile(r"(?<=\n)(CAPÍTULO \(COM \d+\) \n.+?)(?= \n)")
# Article heading: "<n>.- Artículo ..." up to the first "." or "-" that
# follows; also a single capturing group, for the same reason.
regex_article = re.compile(r"(\d+\.- ?Artículo.+?(?:\.|-))")
# re.split() returns the text that precedes the first heading as element 0;
# slicing from this offset drops it.
skip_header_offset = 1
# %%
# Read the PDF page by page. `document` ends up holding the text of every
# page concatenated in order; `page_article` maps each article heading found
# on a page to that page's number (a heading seen again on a later page
# overwrites the earlier entry).
page_article = {}
pdf_page_offset = 1  # enumerate() start value, so page numbers are 1-based
page_texts = []
with fitz.open(document_path) as doc:
    for page_idx, page in enumerate(doc, pdf_page_offset):
        text = page.get_text()
        page_texts.append(text)
        articles = regex_article.findall(text)
        for article in articles:
            page_article[article] = page_idx
# Join once at the end instead of growing a string with += on every page.
document = "".join(page_texts)
len(page_article)
# %%
# Split the full text on chapter headings. The pattern has one capturing
# group, so the split result contains the headings as well as the text
# between them.
splited_chapters = regex_chapters.split(document)
chapters = {}
chapter_name = "header"
for piece in splited_chapters[skip_header_offset:]:
    if not piece.startswith("CAPÍTULO"):
        # Body text: store it under the most recently seen heading.
        chapters[chapter_name] = piece
        continue
    # Heading: turn every " \n" inside it into ": " so it fits on one line.
    chapter_name = piece.replace(" \n", ": ")
len(chapters), chapters.keys()
# %%
minimum_article_length = 65


def format_article(article, *, min_length=None):
    """Collapse the raw text of one article into a single line of text.

    The text is stripped of leading "-" and space characters and split into
    paragraphs on the "\\n \\n" separator. Each paragraph has its newlines
    and asterisks removed and its surrounding whitespace trimmed. Paragraphs
    whose cleaned length is ``min_length`` characters or fewer are dropped,
    unless they start with "El Estado". The surviving paragraphs are joined
    with a single space.

    Args:
        article: Raw article text.
        min_length: Length threshold for dropping short paragraphs. When
            omitted, the module-level ``minimum_article_length`` is used.

    Returns:
        The cleaned text, or an empty string when no paragraph survives.
    """
    if min_length is None:
        # Resolved at call time so that rebinding the module constant
        # (e.g. in a later notebook cell) still takes effect.
        min_length = minimum_article_length

    kept_paragraphs = []
    # A dedicated loop name: the original reused `article` here and thereby
    # overwrote the function's own parameter.
    for paragraph in article.lstrip("- ").split("\n \n"):
        cleaned = paragraph.replace("\n", "").replace("*", "").strip()
        is_too_short = len(cleaned) <= min_length
        is_single = cleaned.startswith("El Estado")
        if is_too_short and not is_single:
            continue
        kept_paragraphs.append(cleaned)
    return " ".join(kept_paragraphs)
# %%
# Build one record per article body: the chapter it sits in, the page
# recorded for its heading, the heading itself and the cleaned body text.
chapter_articles = []
for chapter_name, chapter in chapters.items():
    article_name = "header"
    splited_articles = regex_article.split(chapter)
    for fragment in splited_articles[skip_header_offset:]:
        if regex_article.match(fragment):
            # Heading: remember it for the body fragment(s) that follow.
            article_name = fragment
        else:
            chapter_articles.append(
                {
                    "chapter_name": chapter_name,
                    "article_page": page_article.get(article_name),
                    "article_name": article_name,
                    "article": format_article(fragment),
                }
            )
# %%
df_document = pd.DataFrame.from_dict(chapter_articles)
# Both derived columns are computed from the raw heading, which is read
# once here before "article_name" is overwritten below.
raw_headings = df_document['article_name']
# Leading digits of the heading.
article_numbers = raw_headings.str.extract(r'(^\d+)', expand=False)
# Everything after the "<digits>.-" prefix, minus trailing "." / "-".
article_titles = raw_headings.str.extract(r'^\d+\.- ?(.*)', expand=False)
df_document["article_number"] = article_numbers
df_document["article_name"] = article_titles.str.rstrip(".-")
df_document.head()
# %%
df_document.to_csv("data/articles.csv", index=False)
# %%
|