2 Star 5 Fork 14

ayuliao/AntiCrawlers

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
智能解析.py 12.29 KB
一键复制 编辑 原始数据 按行查看 历史
二两的分身 提交于 2021-08-05 14:17 . enjoy code
import re
from difflib import SequenceMatcher
import requests
from lxml.html import HtmlElement, fromstring
import numpy as np
# Target article used for the extraction demo below.
url = 'https://culture.ifeng.com/c/88H83I7afKE'
# NOTE(review): this HTTP request runs at import time (module level) — any
# importer of this module triggers a network fetch; consider moving it into main().
r = requests.get(url)
html = r.content
# HtmlElement for the whole page; its root node is the <html> element.
element = fromstring(html=html)
# --- Timestamp extraction ----------------------------------------------------
# Regexes matching common ISO/Chinese date-time formats, ordered from the most
# specific (full date + H:M:S) down to the least specific (month + day only),
# so the first pattern that hits yields the most complete timestamp.
#
# Fixes over the original list:
#   * separator classes were written as [-|/|.], which also matched a literal
#     '|' ('|' is not alternation inside a character class) -> [-/.]
#   * raw strings, so '\d' is not an invalid escape (SyntaxWarning on 3.12+)
#   * the "HH时MM分" patterns used [1-24] / [0-60] — actually the character
#     sets {1,2,4} and {0..6} — and required a dangling second "(NN时)" group
#     right after "分"; rewritten as one group with an optional 分 part.
REGEXES = [
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-2]?\d时(?:[0-5]?\d分)?)",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-2]?\d时(?:[0-5]?\d分)?)",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-2]?\d时(?:[0-5]?\d分)?)",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-2]?\d时(?:[0-5]?\d分)?)",
    r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?[0-2]?\d时(?:[0-5]?\d分)?)",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2})",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2})",
    r"(\d{4}年\d{1,2}月\d{1,2}日)",
    r"(\d{2}年\d{1,2}月\d{1,2}日)",
    r"(\d{1,2}月\d{1,2}日)",
]
# Pull the article timestamp out of the page text using the regex list.
def extract_by_regex(element: HtmlElement) -> str:
    """Return the first timestamp that any pattern in REGEXES finds in the
    node's full text, or None when no pattern matches."""
    full_text = ''.join(element.xpath('.//text()'))
    hits = (re.search(pattern, full_text) for pattern in REGEXES)
    first = next((m for m in hits if m), None)
    return first.group(1) if first else None
'''
提取标题
'''
# XPath queries that pull a title out of <meta> nodes.
# Tried in this order; the og:title variants come first as they are the
# most reliable when present.
METASTitle = [
    '//meta[starts-with(@property, "og:title")]/@content',
    '//meta[starts-with(@name, "og:title")]/@content',
    '//meta[starts-with(@property, "title")]/@content',
    '//meta[starts-with(@name, "title")]/@content',
    '//meta[starts-with(@property, "page:title")]/@content',
]
def extract_by_meta(element: HtmlElement) -> str:
    """Return the title found via the first productive <meta> XPath in
    METASTitle (values joined into one string), or None if none yields."""
    for query in METASTitle:
        found = element.xpath(query)
        if not found:
            continue
        return ''.join(found)
    return None
# Title candidate straight from the <title> tag.
def extract_by_title(element: HtmlElement):
    """Return the concatenated text of the <title> node, stripped."""
    fragments = element.xpath('//title//text()')
    return ''.join(fragments).strip()
# Title candidate from heading nodes.
def extract_by_h(element: HtmlElement):
    """Return the concatenated text of every h1/h2/h3 node, stripped."""
    fragments = element.xpath('(//h1//text() | //h2//text() | //h3//text())')
    return ''.join(fragments).strip()
# Longest common contiguous substring of two strings.
def lcs(a, b):
    """Return the longest substring shared by a and b ('' when none)."""
    best = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
    return a[best.a:best.a + best.size]
def extract_title(element: HtmlElement):
    """Best-guess article title.

    Takes the longest substring common to the <title> text, the h1-h3 text
    and the <meta> title. When the page has no usable <meta> title
    (extract_by_meta returns None), falls back to the <title>/<h*> overlap —
    the original code passed None straight into lcs() and raised TypeError.
    """
    by_meta = extract_by_meta(element)
    by_h = extract_by_h(element)
    by_title = extract_by_title(element)
    common = lcs(by_title, by_h)
    # No meta title: skip that refinement stage instead of crashing.
    if not by_meta:
        return common
    return lcs(common, by_meta)
'''
提取正文
'''
from lxml import etree
# Tags whose whole subtree is useless for body extraction and is removed outright.
CONTENT_USELESS_TAGS = ['meta', 'style', 'script', 'link', 'video', 'audio', 'iframe', 'source', 'svg', 'path',
    'symbol', 'img']
# Tags that are unwrapped: the tag itself is removed, its content is kept.
CONTENT_STRIP_TAGS = ['span', 'blockquote']
# Noise containers that are almost never part of the article body.
CONTENT_NOISE_XPATHS = [
    '//div[contains(@class, "comment")]',  # comment sections
    '//div[contains(@class, "advertisement")]',  # ads
    '//div[contains(@class, "advert")]',  # ads
    '//div[contains(@style, "display: none")]',  # invisible elements
]
def remove_element(element: HtmlElement):
    """Detach *element* from its tree.

    No-op when the element has no parent (i.e. it is the root <html> node),
    so the root can never be removed by accident.
    """
    parent = element.getparent()
    if parent is None:
        return
    parent.remove(element)
def remove_children(element: HtmlElement, xpaths=None):
    """Remove every node under *element* matched by any XPath in *xpaths*.

    Returns the (mutated) element, or None when *xpaths* is empty/None.
    """
    if not xpaths:
        return
    for query in xpaths:
        # Detach each match from the tree, children and all.
        for node in element.xpath(query):
            remove_element(node)
    return element
def children(element: HtmlElement):
    """Pre-order traversal: yield *element* itself, then every descendant
    that is an HtmlElement (comments etc. are skipped)."""
    yield element
    for sub in element:
        if not isinstance(sub, HtmlElement):
            continue
        yield from children(sub)
def clear_element(element: HtmlElement):
    """
    Pre-process: clean the HTML tree in place so the text-density algorithm
    has meaningful statistics to work with.
    :param element: root HtmlElement of the page; mutated in place
    :return: None
    """
    # Drop the entire subtree of tags that can never hold body text.
    etree.strip_elements(element, *CONTENT_USELESS_TAGS)
    # Unwrap these tags: remove the tag itself but keep its content.
    etree.strip_tags(element, *CONTENT_STRIP_TAGS)
    # Remove known noise containers (comments, ads, hidden divs).
    remove_children(element, CONTENT_NOISE_XPATHS)
    # Walk the remaining tree.
    # NOTE(review): remove_element() mutates the tree while the children()
    # generator is still iterating it — appears to work with lxml, but verify
    # no sibling nodes get skipped after a removal.
    for child in children(element):
        # For <p>: strip inline span/strong wrappers anywhere below it
        # (strip_tags recurses to any depth).
        if child.tag.lower() == 'p':
            etree.strip_tags(child, 'span')
            etree.strip_tags(child, 'strong')
            # Drop <p> nodes with no direct text. Only .text is checked here,
            # not tail text or text of child nodes.
            if not (child.text and child.text.strip()):
                remove_element(child)
        # A childless <div> that carries text behaves like a paragraph:
        # retag it as <p> so it counts toward the density statistics.
        if child.tag.lower() == 'div' and not child.getchildren() and child.text:
            child.tag = 'p'
class ElementInfo:
    """Per-node statistics used to score how likely a node is the article body.

    Instances start from these class-level defaults; fill_element_info()
    assigns the real per-instance values. Fixes over the original: the three
    density fields hold ratios and were mis-annotated as int; the lxml
    annotation is kept as a string forward reference so evaluating the class
    body does not require lxml.
    """
    # Unique id for the node (hash of the underlying element).
    id: int = None
    # Tag name, e.g. 'p' or 'div'.
    tag_name: str = None
    # The underlying lxml node (string forward ref; assigned before use).
    element: "HtmlElement" = None
    # Total non-whitespace characters under the node.
    number_of_char: int = 0
    # Characters living inside <a> descendants (link text).
    number_of_linked_char: int = 0
    # Number of descendant tags.
    number_of_tag: int = 0
    # Number of <a> descendants.
    number_of_linked_tag: int = 0
    # Number of <p> descendants.
    number_of_p_tag: int = 0
    # Count of punctuation characters in the node's text.
    number_of_punctuation: int = 0
    # Punctuation density (ratio — float, was annotated int).
    density_of_punctuation: float = 1
    # Text density (ratio — float, was annotated int).
    density_of_text: float = 0
    # Final combined score (float, was annotated int).
    density_score: float = 0
def number_of_char(element: HtmlElement):
    """
    Count all non-whitespace characters in the node's text.
    :param element: node whose subtree text is measured
    :return: character count
    """
    full_text = ''.join(element.xpath('.//text()'))
    return len(re.sub(r'\s*', '', full_text, flags=re.S))
def number_of_linked_char(element: HtmlElement):
    """
    Count non-whitespace characters that live inside <a> descendants.
    :param element: node whose link text is measured
    :return: character count
    """
    linked_text = ''.join(element.xpath('.//a//text()'))
    return len(re.sub(r'\s*', '', linked_text, flags=re.S))
def number_of_tag(element: HtmlElement):
    """
    Number of element descendants of the node.
    :param element: node to measure
    :return: descendant tag count
    """
    descendants = element.xpath('.//*')
    return len(descendants)
def number_of_p_tag(element: HtmlElement):
    """
    Number of <p> descendants of the node.
    :param element: node to measure
    :return: paragraph tag count
    """
    paragraphs = element.xpath('.//p')
    return len(paragraphs)
def number_of_linked_tag(element: HtmlElement):
    """Number of <a> descendants of the node."""
    anchors = element.xpath('.//a')
    return len(anchors)
def density_of_text(element_info: ElementInfo):
    """
    Text density: non-link characters divided by non-link tags.
    Returns 0 when the node has no non-link tags (avoids division by zero).
    """
    chars = element_info.number_of_char - element_info.number_of_linked_char
    tags = element_info.number_of_tag - element_info.number_of_linked_tag
    return chars / tags if tags != 0 else 0
def density_of_punctuation(element_info: ElementInfo):
    """
    Punctuation density: non-link characters per punctuation mark (+1 so the
    divisor is never zero).
    :param element_info: statistics holder with the raw counts filled in
    :return: the ratio, mapped to 1 when it is 0 — the final score takes
             log() of this value and log(0) is undefined
    """
    chars = element_info.number_of_char - element_info.number_of_linked_char
    ratio = chars / (element_info.number_of_punctuation + 1)
    return ratio or 1
# Punctuation marks (full- and half-width) recognised by number_of_punctuation.
PUNCTUATION = set('''!,。?、;:“”‘’《》%()<>{}「」【】*~`,.?:;'"!%()''')
def number_of_punctuation(element: HtmlElement):
    """Count punctuation characters in the node's non-whitespace text."""
    compact = re.sub(r'\s*', '', ''.join(element.xpath('.//text()')), flags=re.S)
    return sum(1 for ch in compact if ch in PUNCTUATION)
def children_of_body(element: HtmlElement):
    """Yield the <body> node and all its descendants (via children());
    return an empty list when the page has no <body>."""
    bodies = element.xpath('//body')
    if not bodies:
        return []
    return children(bodies[0])
def fill_element_info(element_info: ElementInfo):
    """Populate every statistic on element_info from its .element node and
    return the same object."""
    node = element_info.element
    element_info.id = hash(node)
    element_info.tag_name = node.tag
    # Raw counts first — the densities below are derived from them.
    for attr, measure in (
        ('number_of_char', number_of_char),
        ('number_of_linked_char', number_of_linked_char),
        ('number_of_tag', number_of_tag),
        ('number_of_linked_tag', number_of_linked_tag),
        ('number_of_p_tag', number_of_p_tag),
        ('number_of_punctuation', number_of_punctuation),
    ):
        setattr(element_info, attr, measure(node))
    element_info.density_of_text = density_of_text(element_info)
    element_info.density_of_punctuation = density_of_punctuation(element_info)
    return element_info
def extract_content(element):
    """
    Score every node under <body> by text density and return the text of the
    highest-scoring node's <p> descendants, one fragment per line.

    Fixes over the original: returns '' instead of crashing with an
    AttributeError when there are no candidate nodes (the old None guard was
    immediately defeated by an unconditional attribute access), and the local
    list no longer shadows the module-level density_of_text() function.
    """
    # Strip scripts/styles/ads etc. so the density statistics are meaningful.
    clear_element(element)
    # Candidates: every node under <body>; <head> content is irrelevant.
    element_infos = []
    for child_element in children_of_body(element):
        info = ElementInfo()
        info.element = child_element
        # Fill in character/tag counts, densities, etc.
        element_infos.append(fill_element_info(info))
    # No <body> or an empty page: nothing to extract.
    if not element_infos:
        return ''
    # Sample standard deviation (ddof=1) of the text densities.
    # NOTE(review): with a single candidate this is nan, and log() of a zero
    # std is -inf — behaviour unchanged from the original scoring formula.
    text_densities = [info.density_of_text for info in element_infos]
    density_of_text_std = np.std(text_densities, ddof=1)
    # Final score per node, as in the original formula.
    for info in element_infos:
        info.density_score = (
            np.log(density_of_text_std)
            * info.density_of_text
            * np.log10(info.number_of_p_tag + 2)
            * np.log(info.density_of_punctuation)
        )
    # Highest score wins.
    element_infos.sort(key=lambda x: x.density_score, reverse=True)
    best = element_infos[0]
    return '\n'.join(best.element.xpath('.//p//text()'))
def main():
    """Extract time, title and body text from the module-level page element
    and print the result as a dict."""
    # Values are evaluated in this order; extract_content() must run last
    # because it mutates the shared element tree while cleaning it.
    data = {
        'time': extract_by_regex(element),
        'title': extract_title(element),
        'text': extract_content(element),
    }
    print(data)
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/ayuLiao/anti-crawlers.git
git@gitee.com:ayuLiao/anti-crawlers.git
ayuLiao
anti-crawlers
AntiCrawlers
master

搜索帮助