代码拉取完成,页面将自动刷新
# 处理word文档
import docx
import os
import re
from PIL import Image
from functools import reduce
import math
from newf.getarea import calculate_article_base
class Article:
"""构造Article类
属性:title,paragraphs,count_pics,
have_pics,count_subtitle"""
def __init__(self,title,paras=[],count_pics=0):
self.title = title
self.sub_title = []
self.paragraphs = paras
self.count_pics = count_pics
self.have_subtitle = False
self.count_sub_title = 0
self.count_title_char = 0
self.count_paras_char = 0
self.count_subtitle_char = 0
self.count_paras = 0
self.article_spaces = []
self.pic =[]
if count_pics > 0:
self.have_pics = True
else:
self.have_pics = False
def get_picture(self,pic_size):
if self.have_pics:
self.pic = pic_size[0:self.count_pics]
pic_size[0:self.count_pics] = []
def set_count(self):
"""设置标题和正文字数"""
self.para_char=[]
self.count_title_char = len(self.title)
self.count_ftitle_char = len(self.sub_title[0])
self.count_ytitle_char = len(self.sub_title[1])
if self.have_subtitle:
self.count_subtitle_char = len(reduce(lambda x,y: x+y,self.sub_title))
else:
self.count_subtitle_char = 0
count = 0
for para in self.paragraphs:
numcount=0
for i in range(10):
numcount+=para.count(str(i))
count += len(para)
self.para_char.append(len(para)-numcount//2-1)
self.count_paras_char = count-numcount//2-1
def set_default_size(self):
"""选取默认样式,计算默认大小"""
pt = 0.35
line_hight = 1.3*10*pt
char_width = 10*pt
width_series = [29.9,40,70,100]
for width in width_series:
width_ = width - 10/3.2#文本块与边框之间的距离为5mm
pic_space = 0
for pic in self.pic:
pic_space += 1600*pic[1] // pic[0]#默认图片宽度为40
self.pics_space = pic_space#图片
maintitle_rows = math.ceil(len(self.title)*40*pt/(width_*3.2))
subtitle_rows = math.ceil(len(self.sub_title[0])*20*pt/(width_*3.2)) + math.ceil(len(self.sub_title[1])*20*pt/(width_*3.2))
title_hight = maintitle_rows*40*pt + subtitle_rows*20*pt#标题高度
text_rows = 0
column_count = math.floor((width+10)/20)#栏数
for para in self.paragraphs:
text_rows += math.ceil((len(para)+2)*char_width/17/3.2 )
text_rows = text_rows // column_count
# print(f'ddd----{self.title}-------------')
# print(maintitle_rows)
# print(subtitle_rows)
# print(text_rows)
text_hight = text_rows*line_hight
self.title_space = title_hight * width // 5
self.text_space = width * text_hight // 5
self.article_space = self.title_space + self.text_space + self.pics_space
self.article_spaces.append(self.article_space)
def get_title_and_para(docx_path):
"""获取word文本标题以及段落内容
返回字典(index:article)"""
result_dict = dict()
count = 0
file = docx.Document(docx_path)
for para in file.paragraphs:#遍历每一段内容,为title和paragraphs,以及图片注释
if para.text:
if para.style.name.startswith('Heading 1'):#用1级标题表示新闻大标题
count += 1
atcl = Article(para.text,[])
result_dict[count] = atcl
else:
if para.style.name.startswith('Heading 2'):#用2级标题表示副标题
atcl.sub_title.append(para.text)
atcl.have_subtitle = True
atcl.count_sub_title += 1
elif para.text.startswith(f'图片{count}'):
atcl.have_pics = True
atcl.count_pics += 1
else:
atcl.paragraphs.append(para.text)
return result_dict
def get_pictures(word_path, result_path,imglist):
"""从word中获得图片,存储在目标路径
图片命名为文件名_图片名(image+n)"""
doc = docx.Document(word_path)
dict_rel = doc.part._rels
if not os.path.exists(result_path):
os.makedirs(result_path)
else:
for file in os.listdir(result_path):
os.remove(result_path+'/'+file)
for rel in dict_rel:
rel = dict_rel[rel]
if "image" in rel.target_ref:#检查word是否含有图片
img_name = re.findall("/(.*)", rel.target_ref)[0]
word_name = os.path.splitext(word_path)[0]
if os.sep in word_name:
new_name = word_name.split('/')[-1]
else:
new_name = word_name.split('/')[-1]
img_name = f'{new_name}_{img_name}'#命名图片
with open(f'{result_path}/{img_name}', "wb") as f:
f.write(rel.target_part.blob)
img_paths = ['./img_article/'+name for name in os.listdir('./img_article')]
result = []
for img_path in img_paths:
img = Image.open(img_path)
imgSize = img.size #大小/尺寸
w = img.width #图片的宽
h = img.height #图片的高
result.append((w,h,img_path))
img.close()#关闭图片
'''
for img_path in img_paths:#重命名
suffix = imglist.pop(0)
new = img_path.split('.')[-1]
os.rename(img_path,'./img_article/'+new_name+suffix+'.'+new)
'''
return result
def prn_obj(obj):
"""打印对象属性"""
print ('\n'.join(['%s:%s' % item for item in obj.__dict__.items()]) )
def get_article_dic(filename:str):
result = get_title_and_para(filename)
########
a = [0]
for i in range(len(result)):
a.append(result[i+1].count_pics)
s = list(a)
imglist = []
for i in range(len(s)):
j = 1
while s[i] > 0:
imglist.append(f'_{i}_{j}_')
s[i] -= 1
j += 1
########
pic_size = get_pictures(filename,'./img_article',imglist)
for n in result:
result[n].set_count()
result[n].get_picture(pic_size)
###
# prn_obj(result[n])
###
for i in range(2):
if 'None' in result[n].sub_title[i]:
result[n].sub_title[i] = ''
result[n].count_sub_title -= 1
if result[n].sub_title[0]=='' and result[n].sub_title[1]=='':
result[n].have_subtitle=False
# result[n].set_default_size()
# print('frfrfrfrffffffffffffffffff',result[n].para)
result[n].count_paras = len(result[n].paragraphs)
result[n].count_ftitle_char = len(result[n].sub_title[0])
result[n].count_ytitle_char = len(result[n].sub_title[1])
return result
if __name__ == '__main__':
#E:\vscodeproject\test1\python_tex\template_initiate\testdocx\test.docx
result = get_title_and_para('./testdocx/jh2.docx')
##########
a = [0]
for i in range(len(result)):
a.append(result[i+1].count_pics)
s = list(a)
imglist = []
for i in range(len(s)):
j = 1
while s[i] > 0:
imglist.append(f'_{i}_{j}_')
s[i] -= 1
j += 1
##########
pic_size = get_pictures('./testdocx/jh2.docx','./img_article',imglist)
print(pic_size)
print(len(result))
for n in result:
result[n].set_count()
result[n].get_picture(pic_size)
for i in range(2):
if 'None' in result[n].sub_title[i]:
result[n].sub_title[i] = ''
result[n].count_sub_title -= 1
if result[n].sub_title[0]=='' and result[n].sub_title[1]=='':
result[n].have_subtitle=False
# result[n].set_default_size()
result[n].count_paras = len(result[n].paragraphs)
result[n].count_ftitle_char = len(result[n].sub_title[0])
result[n].count_ytitle_char = len(result[n].sub_title[1])
prn_obj(result[n])
print('----------------------------------')
# ###将所有一类的数据放到一个数组中##############################
# #新闻信息列表
# title=[]
# paragraphs=[]
# count_pics=[]
# have_subtitle=[]
# count_sub_title=[]
# sub_title=[]
# pic=[]
# have_pics=[]
# para_char=[]
# count_title_char=[]
# for n in result:
# print(n)
# #prn_obj(result[n])
# title.append(result[n].title)
# paragraphs.append(result[n].paragraphs)
# count_pics.append(result[n].count_pics)
# have_subtitle.append(result[n].have_subtitle)
# count_sub_title.append(result[n].count_sub_title)
# sub_title.append(result[n].sub_title)
# pic.append(result[n].pic)
# have_pics.append(result[n].have_pics)
# para_char.append(result[n].para_char)
# count_title_char.append(result[n].count_title_char)
# print(title)
# print(paragraphs)
# print(count_pics)
# print(have_subtitle)
# print(count_sub_title)
# print(sub_title)
# print(pic)
# print(have_pics)
# print(para_char)
# print(count_pics)
# #############
# print(result[1].article_spaces)
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。