|
|
|
|
|
|
|
from openpyxl.cell import cell |
|
from openpyxl.styles import Font, Alignment, PatternFill, NamedStyle, Border, Side, Color |
|
from openpyxl.styles.fills import FILL_SOLID |
|
from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE |
|
from openpyxl.styles.colors import BLACK |
|
|
|
FORMAT_DATE_MMDDYYYY = 'mm/dd/yyyy' |
|
|
|
|
|
def colormap(color): |
|
""" |
|
Convenience for looking up known colors |
|
""" |
|
cmap = {'black': BLACK} |
|
return cmap.get(color, color) |
|
|
|
|
|
def style_string_to_dict(style): |
|
""" |
|
Convert css style string to a python dictionary |
|
""" |
|
def clean_split(string, delim): |
|
return (s.strip() for s in string.split(delim)) |
|
styles = [clean_split(s, ":") for s in style.split(";") if ":" in s] |
|
return dict(styles) |
|
|
|
|
|
def get_side(style, name): |
|
return {'border_style': style.get('border-{}-style'.format(name)), |
|
'color': colormap(style.get('border-{}-color'.format(name)))} |
|
|
|
known_styles = {} |
|
|
|
|
|
def style_dict_to_named_style(style_dict, number_format=None): |
|
""" |
|
Change css style (stored in a python dictionary) to openpyxl NamedStyle |
|
""" |
|
|
|
style_and_format_string = str({ |
|
'style_dict': style_dict, |
|
'parent': style_dict.parent, |
|
'number_format': number_format, |
|
}) |
|
|
|
if style_and_format_string not in known_styles: |
|
|
|
font = Font(bold=style_dict.get('font-weight') == 'bold', |
|
color=style_dict.get_color('color', None), |
|
size=style_dict.get('font-size')) |
|
|
|
|
|
alignment = Alignment(horizontal=style_dict.get('text-align', 'general'), |
|
vertical=style_dict.get('vertical-align'), |
|
wrap_text=style_dict.get('white-space', 'nowrap') == 'normal') |
|
|
|
|
|
bg_color = style_dict.get_color('background-color') |
|
fg_color = style_dict.get_color('foreground-color', Color()) |
|
fill_type = style_dict.get('fill-type') |
|
if bg_color and bg_color != 'transparent': |
|
fill = PatternFill(fill_type=fill_type or FILL_SOLID, |
|
start_color=bg_color, |
|
end_color=fg_color) |
|
else: |
|
fill = PatternFill() |
|
|
|
|
|
border = Border(left=Side(**get_side(style_dict, 'left')), |
|
right=Side(**get_side(style_dict, 'right')), |
|
top=Side(**get_side(style_dict, 'top')), |
|
bottom=Side(**get_side(style_dict, 'bottom')), |
|
diagonal=Side(**get_side(style_dict, 'diagonal')), |
|
diagonal_direction=None, |
|
outline=Side(**get_side(style_dict, 'outline')), |
|
vertical=None, |
|
horizontal=None) |
|
|
|
name = 'Style {}'.format(len(known_styles) + 1) |
|
|
|
pyxl_style = NamedStyle(name=name, font=font, fill=fill, alignment=alignment, border=border, |
|
number_format=number_format) |
|
|
|
known_styles[style_and_format_string] = pyxl_style |
|
|
|
return known_styles[style_and_format_string] |
|
|
|
|
|
class StyleDict(dict): |
|
""" |
|
It's like a dictionary, but it looks for items in the parent dictionary |
|
""" |
|
def __init__(self, *args, **kwargs): |
|
self.parent = kwargs.pop('parent', None) |
|
super(StyleDict, self).__init__(*args, **kwargs) |
|
|
|
def __getitem__(self, item): |
|
if item in self: |
|
return super(StyleDict, self).__getitem__(item) |
|
elif self.parent: |
|
return self.parent[item] |
|
else: |
|
raise KeyError('{} not found'.format(item)) |
|
|
|
def __hash__(self): |
|
return hash(tuple([(k, self.get(k)) for k in self._keys()])) |
|
|
|
|
|
|
|
|
|
def _keys(self): |
|
yielded = set() |
|
for k in self.keys(): |
|
yielded.add(k) |
|
yield k |
|
if self.parent: |
|
for k in self.parent._keys(): |
|
if k not in yielded: |
|
yielded.add(k) |
|
yield k |
|
|
|
def get(self, k, d=None): |
|
try: |
|
return self[k] |
|
except KeyError: |
|
return d |
|
|
|
def get_color(self, k, d=None): |
|
""" |
|
Strip leading # off colors if necessary |
|
""" |
|
color = self.get(k, d) |
|
if hasattr(color, 'startswith') and color.startswith('#'): |
|
color = color[1:] |
|
if len(color) == 3: |
|
color = ''.join(2 * c for c in color) |
|
return color |
|
|
|
|
|
class Element(object): |
|
""" |
|
Our base class for representing an html element along with a cascading style. |
|
The element is created along with a parent so that the StyleDict that we store |
|
can point to the parent's StyleDict. |
|
""" |
|
def __init__(self, element, parent=None): |
|
self.element = element |
|
self.number_format = None |
|
parent_style = parent.style_dict if parent else None |
|
self.style_dict = StyleDict(style_string_to_dict(element.get('style', '')), parent=parent_style) |
|
self._style_cache = None |
|
|
|
def style(self): |
|
""" |
|
Turn the css styles for this element into an openpyxl NamedStyle. |
|
""" |
|
if not self._style_cache: |
|
self._style_cache = style_dict_to_named_style(self.style_dict, number_format=self.number_format) |
|
return self._style_cache |
|
|
|
def get_dimension(self, dimension_key): |
|
""" |
|
Extracts the dimension from the style dict of the Element and returns it as a float. |
|
""" |
|
dimension = self.style_dict.get(dimension_key) |
|
if dimension: |
|
if dimension[-2:] in ['px', 'em', 'pt', 'in', 'cm']: |
|
dimension = dimension[:-2] |
|
dimension = float(dimension) |
|
return dimension |
|
|
|
|
|
class Table(Element): |
|
""" |
|
The concrete implementations of Elements are semantically named for the types of elements we are interested in. |
|
This defines a very concrete tree structure for html tables that we expect to deal with. I prefer this compared to |
|
allowing Element to have an arbitrary number of children and dealing with an abstract element tree. |
|
""" |
|
def __init__(self, table): |
|
""" |
|
takes an html table object (from lxml) |
|
""" |
|
super(Table, self).__init__(table) |
|
table_head = table.find('thead') |
|
self.head = TableHead(table_head, parent=self) if table_head is not None else None |
|
table_body = table.find('tbody') |
|
self.body = TableBody(table_body if table_body is not None else table, parent=self) |
|
|
|
|
|
class TableHead(Element): |
|
""" |
|
This class maps to the `<th>` element of the html table. |
|
""" |
|
def __init__(self, head, parent=None): |
|
super(TableHead, self).__init__(head, parent=parent) |
|
self.rows = [TableRow(tr, parent=self) for tr in head.findall('tr')] |
|
|
|
|
|
class TableBody(Element): |
|
""" |
|
This class maps to the `<tbody>` element of the html table. |
|
""" |
|
def __init__(self, body, parent=None): |
|
super(TableBody, self).__init__(body, parent=parent) |
|
self.rows = [TableRow(tr, parent=self) for tr in body.findall('tr')] |
|
|
|
|
|
class TableRow(Element): |
|
""" |
|
This class maps to the `<tr>` element of the html table. |
|
""" |
|
def __init__(self, tr, parent=None): |
|
super(TableRow, self).__init__(tr, parent=parent) |
|
self.cells = [TableCell(cell, parent=self) for cell in tr.findall('th') + tr.findall('td')] |
|
|
|
|
|
def element_to_string(el): |
|
return _element_to_string(el).strip() |
|
|
|
|
|
def _element_to_string(el): |
|
string = '' |
|
|
|
for x in el.iterchildren(): |
|
string += '\n' + _element_to_string(x) |
|
|
|
text = el.text.strip() if el.text else '' |
|
tail = el.tail.strip() if el.tail else '' |
|
|
|
return text + string + '\n' + tail |
|
|
|
|
|
class TableCell(Element): |
|
""" |
|
This class maps to the `<td>` element of the html table. |
|
""" |
|
CELL_TYPES = {'TYPE_STRING', 'TYPE_FORMULA', 'TYPE_NUMERIC', 'TYPE_BOOL', 'TYPE_CURRENCY', 'TYPE_PERCENTAGE', |
|
'TYPE_NULL', 'TYPE_INLINE', 'TYPE_ERROR', 'TYPE_FORMULA_CACHE_STRING', 'TYPE_INTEGER'} |
|
|
|
def __init__(self, cell, parent=None): |
|
super(TableCell, self).__init__(cell, parent=parent) |
|
self.value = element_to_string(cell) |
|
self.number_format = self.get_number_format() |
|
|
|
def data_type(self): |
|
cell_types = self.CELL_TYPES & set(self.element.get('class', '').split()) |
|
if cell_types: |
|
if 'TYPE_FORMULA' in cell_types: |
|
|
|
cell_type = 'TYPE_FORMULA' |
|
elif cell_types & {'TYPE_CURRENCY', 'TYPE_INTEGER', 'TYPE_PERCENTAGE'}: |
|
cell_type = 'TYPE_NUMERIC' |
|
else: |
|
cell_type = cell_types.pop() |
|
else: |
|
cell_type = 'TYPE_STRING' |
|
return getattr(cell, cell_type) |
|
|
|
def get_number_format(self): |
|
if 'TYPE_CURRENCY' in self.element.get('class', '').split(): |
|
return FORMAT_CURRENCY_USD_SIMPLE |
|
if 'TYPE_INTEGER' in self.element.get('class', '').split(): |
|
return '#,##0' |
|
if 'TYPE_PERCENTAGE' in self.element.get('class', '').split(): |
|
return FORMAT_PERCENTAGE |
|
if 'TYPE_DATE' in self.element.get('class', '').split(): |
|
return FORMAT_DATE_MMDDYYYY |
|
if self.data_type() == cell.TYPE_NUMERIC: |
|
try: |
|
int(self.value) |
|
except ValueError: |
|
return '#,##0.##' |
|
else: |
|
return '#,##0' |
|
|
|
def format(self, cell): |
|
cell.style = self.style() |
|
data_type = self.data_type() |
|
if data_type: |
|
cell.data_type = data_type |