# st / app.py
# lijk20's picture
# Upload 2 files
# c9e13b4
import io
import docx
import configparser
import pandas as pd
import asyncio
from docx import Document
from docxtpl import DocxTemplate
from docx.shared import Pt
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx.enum.dml import MSO_THEME_COLOR_INDEX
from docx.enum.style import WD_STYLE_TYPE
from docx.shared import Cm, Inches
from docx.oxml.shared import OxmlElement
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import RGBColor
from docx.enum.text import WD_COLOR_INDEX
from requests_toolbelt import MultipartEncoder
from datetime import datetime, timedelta
def count_values(df, col_name):
    """Tally how often each distinct value occurs in one column.

    :param df: DataFrame holding the column to tally.
    :param col_name: Name of the column whose values are counted.
    :return: DataFrame with two columns, ``col_name`` (the distinct values)
        and ``'count'`` (their frequencies), in ``value_counts`` order
        (most frequent first) with a fresh 0..n-1 index.
    """
    frequencies = df[col_name].value_counts()
    # Name the index after the source column so that reset_index() emits it
    # under that name, and label the frequency column 'count'.
    return frequencies.rename_axis(col_name).reset_index(name='count')
def add_hyperlink(paragraph, url, text):
    """
    Append an external hyperlink to a paragraph.
    :param paragraph: The paragraph object that receives the hyperlink.
    :param url: The link target, as a string.
    :param text: The label shown for the link.
    :return: The Run object that was added to hold the hyperlink element.
    """
    # Register the URL as an external relationship of the owning part; the
    # returned relationship id is what the w:hyperlink element points at.
    rel_id = paragraph.part.relate_to(url, RT.HYPERLINK, is_external=True)

    # <w:hyperlink r:id="..." w:history="1">
    link_elm = OxmlElement('w:hyperlink')
    for attr_name, attr_value in ((qn('r:id'), rel_id), (qn('w:history'), '1')):
        link_elm.set(attr_name, attr_value)

    # Run properties referencing the "Hyperlink" character style. Per the
    # original author's note this has no visible effect unless the template
    # actually defines that style.
    style_ref = OxmlElement('w:rStyle')
    style_ref.set(qn('w:val'), 'Hyperlink')
    run_props = OxmlElement('w:rPr')
    run_props.append(style_ref)

    # The inner run carries the properties first, then the visible text.
    inner_run = OxmlElement('w:r')
    inner_run.append(run_props)
    inner_run.text = text
    link_elm.append(inner_run)

    # Host the hyperlink element inside a freshly added run of the paragraph.
    host_run = paragraph.add_run()
    host_run._r.append(link_elm)

    # Fallback look for templates without a "Hyperlink" style: theme link
    # colour plus underline (the link will not change colour once visited).
    host_font = host_run.font
    host_font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    host_font.underline = True
    return host_run
def create_table(document,count_df1):
    """Append a 2x2 overview table listing the item count per category.

    The first row holds the two bold, centred column headings; the second
    row holds the left-aligned per-category counts.

    :param document: Document the table is appended to.
    :param count_df1: Integer-indexable collection of counts; positions 0-6
        fill the left column and position 7 fills the right column.
        NOTE(review): the caller must supply the counts in the same order as
        the labels below - confirm against the call site.
    """
    face = "思源黑体 Regular"
    alignment = docx.enum.text.WD_PARAGRAPH_ALIGNMENT

    def write_cell(cell, content, align, bold):
        # Set the text, align its single paragraph and format its first run.
        cell.text = content
        paragraph = cell.paragraphs[0]
        paragraph.alignment = align
        first_run = paragraph.runs[0]
        first_run.font.name = face
        # Also set the East-Asian font slot so CJK text uses the same face.
        first_run._element.rPr.rFonts.set(qn('w:eastAsia'), face)
        first_run.font.size = docx.shared.Pt(8)
        if bold:
            first_run.font.bold = True

    table = document.add_table(rows=2, cols=2)
    # Column widths.
    for column_index in (0, 1):
        table.columns[column_index].width = docx.shared.Inches(3.7)
    # Table borders.
    table.style = 'Table Grid'
    # Height of the heading row.
    table.rows[0].height = docx.shared.Pt(9)

    # First row: column headings.
    heading_left, heading_right = table.rows[0].cells[:2]
    write_cell(heading_left, "技术进展", alignment.CENTER, True)
    write_cell(heading_right, "业内动态", alignment.CENTER, True)

    # Second row: per-category counts.
    body_left, body_right = table.rows[1].cells[:2]
    left_template = '''\t图像理解与生成 \t{0}项\n\t计算光学 \t{1}项\n\t图像处理 \t{2}项\n\t机器学习前沿 \t{3}项\n\t自然语言交互 \t{4}项\n\t量子计算 \t{5}项\n\t计算机视觉前沿 \t{6}项'''
    write_cell(
        body_left,
        left_template.format(*(count_df1[position] for position in range(7))),
        alignment.LEFT,
        False,
    )
    write_cell(
        body_right,
        "\t大厂动态 \t{0}项\n".format(count_df1[7]),
        alignment.LEFT,
        False,
    )
def _ensure_style(document, name, font_name, size_pt, bold, style_type=None,
                  base_style=None, color=None, underline=None, highlight=None):
    """Return style ``name`` with its font configured, creating it if absent.

    :param document: Document whose style collection is read and updated.
    :param name: Style name to look up or create.
    :param font_name: Typeface, applied to the regular and East-Asian slots.
    :param size_pt: Font size in points.
    :param bold: Value assigned to the style's bold flag.
    :param style_type: Style type used only when the style has to be created;
        defaults to a paragraph style.
    :param base_style: Optional name of the style this one is based on.
    :param color: Optional ``(r, g, b)`` tuple for the font colour.
    :param underline: Optional underline flag.
    :param highlight: Optional highlight colour index.
    :raises KeyError: if ``base_style`` is given but not defined in the document.
    """
    try:
        style = document.styles[name]
    except KeyError:
        # Only "style not found" should trigger creation; any other failure
        # must surface instead of being hidden behind a second attempt.
        if style_type is None:
            style_type = WD_STYLE_TYPE.PARAGRAPH
        style = document.styles.add_style(name, style_type)
    if base_style is not None:
        style.base_style = document.styles[base_style]
    font = style.font
    font.name = font_name
    # font.name creates the rFonts element; its East-Asian slot has to be
    # filled separately so CJK text is rendered with the same typeface.
    style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
    font.bold = bold
    font.size = Pt(size_pt)
    if color is not None:
        font.color.rgb = RGBColor(*color)
    if underline is not None:
        font.underline = underline
    if highlight is not None:
        font.highlight_color = highlight
    return style


def _register_weekly_styles(document):
    """Define (or re-configure) every custom style used by the weekly report."""
    regular = "思源黑体 Regular"
    heavy = "思源黑体 Bold"
    light = "思源黑体 Light"
    black = (0, 0, 0)
    # Date range shown at the very top.
    _ensure_style(document, 'date_range', regular, 14, True)
    # Spacer line below a chapter heading.
    _ensure_style(document, 'title2', heavy, 16, True)
    # Chapter headings, based on "Heading 1".
    _ensure_style(document, 'title', heavy, 16, True, base_style='Heading 1', color=black)
    # "Hot topics" sub-heading, underlined.
    _ensure_style(document, 'tech_progress', regular, 9, False, underline=True)
    # "Hot topics" notes and detail lines, grey.
    _ensure_style(document, 'content', regular, 9, False, color=(89, 89, 89))
    # Body text (paragraph style).
    _ensure_style(document, 'weekly_summary', regular, 12, False)
    # Body text (character style, applied to runs inside the first paragraph).
    _ensure_style(document, 'inside_para', regular, 12, False,
                  style_type=WD_STYLE_TYPE.CHARACTER)
    # Hot-topic / entry summary text.
    _ensure_style(document, 'part3_style', regular, 11, False)
    # Section headings inside the tracking chapter, based on "Heading 1".
    _ensure_style(document, 'tech', heavy, 14, True, base_style='Heading 1', color=black)
    _ensure_style(document, 'tech2', heavy, 14, True)
    # Sub-headings inside a section, highlighted with 25% grey.
    _ensure_style(document, 'tech1', regular, 12, True, highlight=WD_COLOR_INDEX.GRAY_25)
    # "date | title" line of an entry.
    _ensure_style(document, 'title_date', heavy, 12, False)
    # "involved technology" line of an entry.
    _ensure_style(document, 'tech_detail', light, 10, False)
    # Expert-comment heading of an entry.
    _ensure_style(document, 'expert', regular, 12, True)


def _add_section_intro(document, heading):
    """Append a section heading followed by the fixed focus/details scaffold."""
    document.add_paragraph(heading, style='tech')
    document.add_paragraph("", style='tech2')
    document.add_paragraph("进展聚焦", style='tech1')
    document.add_paragraph("", style='tech1')
    document.add_paragraph("【占位】", style='weekly_summary')
    document.add_paragraph("", style='tech1')
    document.add_paragraph("进展详情", style='tech1')


def _format_entry_date(raw, old_format='%Y.%m.%d', new_format='%Y-%m-%d'):
    """Render an entry date as ``YYYY-MM-DD``.

    Accepts either a ``YYYY.MM.DD`` string or a datetime-like value (which is
    what pandas yields when the Excel cell is a real date).
    """
    if isinstance(raw, datetime):
        moment = raw
    else:
        moment = datetime.strptime(str(raw).strip(), old_format)
    return moment.strftime(new_format)


def _fill_entry_cell(cell, headline, tech_line, summary, url, link_text,
                     expert_heading=None):
    """Write one report entry into a table cell.

    Layout: blank line, headline, technology line, summary (with the source
    link appended), blank line and - when ``expert_heading`` is given - the
    expert-comment heading followed by a blank line for the comment itself.
    """
    cell.text = ""
    cell.paragraphs[0].style = "title_date"
    cell.add_paragraph(headline, style="title_date")
    cell.add_paragraph(tech_line, style="tech_detail")
    summary_paragraph = cell.add_paragraph(summary, style="part3_style")
    add_hyperlink(summary_paragraph, url, link_text)
    cell.add_paragraph("", style="part3_style")
    if expert_heading is not None:
        cell.add_paragraph(expert_heading, style="expert")
        # Style the paragraph that was just added; the previous code styled
        # index 5 twice and left this trailing paragraph unstyled.
        cell.add_paragraph("", style="expert")


def 荣耀周报排版(xlsx,template):
    """Build the weekly-report document from an Excel sheet and a Word template.

    :param xlsx: Path or file-like object readable by ``pandas.read_excel``.
        The sheet must provide the columns 领域, 时间, 标题, 涉及技术,
        简述(摘要), 源链接 and 是否点评.
    :param template: Path or file-like object passed to ``Document``; it must
        define the "Heading 1" style.
    :return: The populated document object (not saved).
    """
    document = Document(template)
    df = pd.read_excel(xlsx)
    res = df.sort_values(by='领域', ascending=True)
    tech_sections = ["图像理解与生成", "计算光学", "图像处理", "机器学习前沿",
                     "自然语言交互", "计算机视觉前沿", "量子计算"]
    industry_sections = ["产品发布", "大厂动态", "项目开源"]

    _register_weekly_styles(document)

    titles = ['一、本期目录', '二、热点速览', '三、定向追踪']

    # Opening line: date range followed by the overview caption.
    para1 = document.add_paragraph("2023 年 x 月 x 日 —— 2023 年 x 月 x 日")
    para1.style = document.styles["date_range"]
    run = para1.add_run(" ")
    run.style = document.styles["inside_para"]
    run = para1.add_run("【本期荣耀周报内容概览】")
    run.style = document.styles["inside_para"]
    run.font.bold = False
    document.add_paragraph("", style="weekly_summary")

    # Chapter 1: table of contents.
    document.add_paragraph(titles[0], style='title')
    document.add_paragraph("", style='title2')

    # Chapter 2: hot topics at a glance.
    document.add_paragraph(titles[1], style='title')
    document.add_paragraph("", style='weekly_summary')
    document.add_paragraph("【本周期热点总结】", style='weekly_summary')
    document.add_paragraph("", style='weekly_summary')
    document.add_paragraph("以下为本周期热点速览,以事件发生时间排序。", style='weekly_summary')
    document.add_paragraph("", style='tech_progress')
    document.add_paragraph("技术进展 · 【领域】 · 【涉及技术】", style='tech_progress')
    document.add_paragraph("【技术进展正文】", style='part3_style')
    document.add_paragraph("热点注释:", style='content')
    document.add_paragraph("查看详情:", style='content')
    document.add_paragraph("", style='content')

    # Chapter 3: targeted tracking.
    document.add_paragraph(titles[2], style='title')
    document.add_paragraph("", style='title2')

    # One section per technical field, filled from the spreadsheet.
    for section in tech_sections:
        _add_section_intro(document, "技术进展 · " + section)
        # na=False keeps empty 领域 cells out of the mask instead of breaking it.
        in_section = res["领域"].str.contains(section, regex=False, na=False)
        entries = res[in_section].sort_values(by="时间", ascending=False)
        # Size the table from the rows actually selected, and skip it entirely
        # when the field has no entries this week.
        if len(entries):
            table = document.add_table(rows=len(entries), cols=1)
            for (_, entry), row in zip(entries.iterrows(), table.rows):
                reviewed = entry["是否点评"] == "是"
                _fill_entry_cell(
                    row.cells[0],
                    headline=_format_entry_date(entry["时间"]) + " | " + str(entry["标题"]),
                    tech_line="· " + str(entry["涉及技术"]),
                    summary=str(entry["简述(摘要)"]),
                    url=entry["源链接"],
                    link_text="原文链接",
                    expert_heading="专家点评" if reviewed else None,
                )
        document.add_page_break()

    # Industry-news sections contain a single placeholder entry each and are
    # separated (not followed) by page breaks.
    for position, category in enumerate(industry_sections):
        if position:
            document.add_page_break()
        _add_section_intro(document, "业内动态 · " + category)
        table = document.add_table(rows=1, cols=1)
        _fill_entry_cell(
            table.rows[0].cells[0],
            headline="yyyy-mm-dd" + " | " + "【标题占位】",
            tech_line="· " + "【涉及技术】",
            summary="【简述(摘要)占位】",
            url="【】",
            link_text="【原文链接占位】",
            expert_heading="【专家点评占位】",
        )
    return document
import pandas as pd
import docx
# Gradio 部分
import gradio as gr
import streamlit as st
from io import BytesIO
# def excel_to_docx(xlsx):
# # 处理 Excel 文件并生成 docx 文件
# document,name = 荣耀周报排版(xlsx)
# return document.getvalue()
# 定义 Gradio 的输入和输出界面
# inputs = gr.inputs.File(label="Excel 文件", type=["file"])
# outputs = gr.outputs.File(label="docx 文件")
# Streamlit page: upload the Excel data (and optionally a Word template),
# generate the weekly report and offer it for download.
st.title('Translator App')
st.markdown("Translate from Docx file")
st.subheader("File Upload")
datas=st.file_uploader("Original File")
template=st.file_uploader("template File")
name=st.text_input('Enter New File Name: ')
stream = BytesIO()
if st.button(label='生成'):
    if datas is None:
        # Without an uploaded spreadsheet read_excel would receive None and
        # fail with an unhelpful traceback.
        st.error("Please upload the Excel file before generating the report.")
    else:
        # st.spinner is a context manager: it only shows while its block runs.
        with st.spinner('Waiting...'):
            document = 荣耀周报排版(datas, template)
            document.save(stream)
        st.success("Translated")
        # Fall back to a fixed base name so the download is never just ".docx".
        base_name = name.strip() or "weekly_report"
        st.download_button(label='Download Translated File',file_name=(f"{base_name}.docx"), data=stream.getvalue())