代码拉取完成,页面将自动刷新
同步操作将从 独一无二/CAJ_PDF_Word转换工具 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
import struct
class HNParsePage(object):
    """Parse the raw bytes of one page into text characters and figures.

    The data is scanned as a sequence of records, each introduced by a
    16-bit little-endian dispatch code.  Known codes are handed to a
    handler that consumes the record; unknown codes are skipped together
    with the two bytes that follow them.

    After construction the instance exposes:

    * ``characters`` -- list of decoded text fragments, in stream order.
    * ``figures`` -- list of ``[offset_x, offset_y, size_x, size_y]``.
    * ``stats`` -- dict mapping each dispatch code seen to its count.
    * ``offset`` -- position in ``data`` where scanning stopped.
    * ``texts`` -- the characters joined into one cleaned-up string.

    Args:
        data: bytes-like page content (indexing must yield ints).
        old_style: when true, codes 0x8001 and 0x8070 are parsed as
            multi-character text runs instead of single-character
            text records.
    """

    # HTL: byte pairs that are not valid GBK.  When cut-and-pasted on Linux
    # they transform to GB18030, but they are believed to be OCR artifacts.
    # Where they occur, 0xA38D 0xA38A (always together) are line breaks, and
    # 0xA389 / 0xA3A0 are tabs and spaces.
    _WHITESPACE_CODES = {
        0xA389: "\t",
        0xA38A: "\n",
        0xA38D: "\r",
        0xA3A0: " ",
    }

    # Layout of a figure record following its dispatch code (26 bytes).
    _FIGURE_RECORD = struct.Struct("<HHHHHIIII")

    def __init__(self, data, old_style=False):
        self.data = data
        self.data_length = len(data)
        self.characters = []
        self.figures = []
        self.stats = {}
        self.offset = 0
        self._parse(old_style)

    def _parse(self, old_style):
        """Walk the whole buffer, dispatching each record to its handler."""
        if old_style:
            dispatch = {
                0x8001: self._text_multi,
                0x8070: self._text_multi,
                0x800A: self._figure,
            }
        else:
            dispatch = {
                0x8001: self._text,
                0x800A: self._figure,
            }

        while self.offset <= self.data_length - 2:
            # Explicit little-endian, matching the figure record layout,
            # so the result does not depend on the host byte order.
            (dispatch_code,) = struct.unpack_from("<H", self.data, self.offset)
            self.offset += 2

            handler = dispatch.get(dispatch_code)
            if handler is None:
                # Unknown code: skip the two bytes that follow it.
                self.offset += 2
            else:
                handler(dispatch_code)

            self.stats[dispatch_code] = self.stats.get(dispatch_code, 0) + 1

    def _text(self, code):
        """Consume a single-character text record (6 bytes after the code).

        The character is stored low byte first at offsets +4/+5 of the
        record and is decoded as GBK.  Undecodable pairs are mapped through
        ``_WHITESPACE_CODES`` or, failing that, emitted as a ``<0xNNNN>``
        placeholder.  A truncated record contributes no character.
        """
        pos = self.offset
        if pos + 5 < self.data_length:
            high, low = self.data[pos + 5], self.data[pos + 4]
            try:
                self.characters.append(bytes([high, low]).decode("gbk"))
            except UnicodeDecodeError:
                raw = high * 256 + low
                self.characters.append(
                    self._WHITESPACE_CODES.get(raw, "<0x%04X>\n" % raw))
        # The offset advances even when the record was too short to read.
        self.offset += 6

    def _text_multi(self, code):
        """Consume a run of characters stored as consecutive 4-byte entries.

        Code 0x8001 starts a new line before the run.  The run ends at the
        first entry whose second byte is 0x80 (presumably the start of the
        next dispatch code -- left unconsumed) or when the data runs out.
        """
        self.offset += 2
        if code == 0x8001:
            self.characters.append("\n")

        while True:
            pos = self.offset
            if pos + 1 >= self.data_length:
                return  # short data, nothing to do
            if self.data[pos + 1] == 0x80:
                return  # terminator: leave it for the main loop
            if pos + 3 >= self.data_length:
                return  # short data, nothing to do

            high, low = self.data[pos + 3], self.data[pos + 2]
            try:
                self.characters.append(bytes([high, low]).decode("gbk"))
            except UnicodeDecodeError:
                self.characters.append("<0x%04X>\n" % (high * 256 + low))
            self.offset += 4

    def _figure(self, code):
        """Consume a 26-byte figure record and store its placement.

        A truncated record is ignored and the offset is left unchanged.
        """
        record = self._FIGURE_RECORD
        if self.offset + record.size > self.data_length:
            return  # short data, nothing to do

        fields = record.unpack_from(self.data, self.offset)
        # Fields 1..4 are offset_x, offset_y, size_x, size_y,
        # in units of 1/2.473 pixels; the remaining fields are unused.
        self.figures.append(list(fields[1:5]))
        self.offset += record.size

    @property
    def texts(self):
        """Return all characters joined, with NUL and CR characters removed."""
        text = "".join(self.characters)
        # str.replace returns a new string; the result must be kept.
        return text.replace("\x00", "").replace("\r", "")
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。