|
|
|
from __future__ import absolute_import |
|
|
|
from lxml import html |
|
from openpyxl import Workbook |
|
from openpyxl.utils import get_column_letter |
|
from premailer import Premailer |
|
from tablepyxl.style import Table |
|
|
|
|
|
def string_to_int(s): |
|
if s.isdigit(): |
|
return int(s) |
|
return 0 |
|
|
|
|
|
def get_Tables(doc): |
|
tree = html.fromstring(doc) |
|
comments = tree.xpath('//comment()') |
|
for comment in comments: |
|
comment.drop_tag() |
|
return [Table(table) for table in tree.xpath('//table')] |
|
|
|
|
|
def write_rows(worksheet, elem, row, column=1): |
|
""" |
|
Writes every tr child element of elem to a row in the worksheet |
|
returns the next row after all rows are written |
|
""" |
|
from openpyxl.cell.cell import MergedCell |
|
|
|
initial_column = column |
|
for table_row in elem.rows: |
|
for table_cell in table_row.cells: |
|
cell = worksheet.cell(row=row, column=column) |
|
while isinstance(cell, MergedCell): |
|
column += 1 |
|
cell = worksheet.cell(row=row, column=column) |
|
|
|
colspan = string_to_int(table_cell.element.get("colspan", "1")) |
|
rowspan = string_to_int(table_cell.element.get("rowspan", "1")) |
|
if rowspan > 1 or colspan > 1: |
|
worksheet.merge_cells(start_row=row, start_column=column, |
|
end_row=row + rowspan - 1, end_column=column + colspan - 1) |
|
|
|
cell.value = table_cell.value |
|
table_cell.format(cell) |
|
min_width = table_cell.get_dimension('min-width') |
|
max_width = table_cell.get_dimension('max-width') |
|
|
|
if colspan == 1: |
|
|
|
|
|
|
|
|
|
width = max(worksheet.column_dimensions[get_column_letter(column)].width or 0, len(table_cell.value) + 2) |
|
if max_width and width > max_width: |
|
width = max_width |
|
elif min_width and width < min_width: |
|
width = min_width |
|
worksheet.column_dimensions[get_column_letter(column)].width = width |
|
column += colspan |
|
row += 1 |
|
column = initial_column |
|
return row |
|
|
|
|
|
def table_to_sheet(table, wb): |
|
""" |
|
Takes a table and workbook and writes the table to a new sheet. |
|
The sheet title will be the same as the table attribute name. |
|
""" |
|
ws = wb.create_sheet(title=table.element.get('name')) |
|
insert_table(table, ws, 1, 1) |
|
|
|
|
|
def document_to_workbook(doc, wb=None, base_url=None): |
|
""" |
|
Takes a string representation of an html document and writes one sheet for |
|
every table in the document. |
|
The workbook is returned |
|
""" |
|
if not wb: |
|
wb = Workbook() |
|
wb.remove(wb.active) |
|
|
|
inline_styles_doc = Premailer(doc, base_url=base_url, remove_classes=False).transform() |
|
tables = get_Tables(inline_styles_doc) |
|
|
|
for table in tables: |
|
table_to_sheet(table, wb) |
|
|
|
return wb |
|
|
|
|
|
def document_to_xl(doc, filename, base_url=None): |
|
""" |
|
Takes a string representation of an html document and writes one sheet for |
|
every table in the document. The workbook is written out to a file called filename |
|
""" |
|
wb = document_to_workbook(doc, base_url=base_url) |
|
wb.save(filename) |
|
|
|
|
|
def insert_table(table, worksheet, column, row): |
|
if table.head: |
|
row = write_rows(worksheet, table.head, row, column) |
|
if table.body: |
|
row = write_rows(worksheet, table.body, row, column) |
|
|
|
|
|
def insert_table_at_cell(table, cell): |
|
""" |
|
Inserts a table at the location of an openpyxl Cell object. |
|
""" |
|
ws = cell.parent |
|
column, row = cell.column, cell.row |
|
insert_table(table, ws, column, row) |