1 Star 0 Fork 1

云计算和大数据_东南大学/Cosine Robot

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
codeRecom.py 5.87 KB
一键复制 编辑 原始数据 按行查看 历史
qiyuehan 提交于 2021-12-20 15:32 . 20211220
# -*- coding:utf-8 -*-
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from CodeRecom1.BOKEYUAN import searchBoKeYuan
from CodeRecom1.FIVE_ONE import searchfiveone
# Number of code templates to collect per search: searchCSDN stops once it has
# gathered this many, and the value is also passed to the other search backends.
model_number = 4
# Simulate a CSDN search
def searchCSDN(q):
    """Search CSDN for ``q`` and collect code snippets from the matching blogs.

    Queries the CSDN search API, fetches every result whose URL starts with
    ``http``, and concatenates the children of each ``<code>`` element on
    that page into one HTML fragment (newlines become ``<br>``, spaces become
    ``&nbsp;``).  Pages that yield no code are skipped.  Collection stops as
    soon as ``model_number`` fragments have been gathered.

    Args:
        q: Search keywords, sent as the ``q`` query parameter.

    Returns:
        A ``(str_code_list, url_list)`` tuple of two parallel lists:
        ``str_code_list[i]`` is the HTML fragment extracted from the blog
        page at ``url_list[i]``.

    Raises:
        requests.RequestException: If the search request itself fails or
            times out.  Failures on individual blog pages are skipped.
    """
    # Seconds to wait on each HTTP request, so that an unresponsive server
    # cannot hang the whole search.
    request_timeout = 10

    # Endpoint behind the CSDN search box
    url = 'https://so.csdn.net/api/v3/search'
    # Request headers for the search call
    # NOTE(review): the Cookie value is hard-coded and embeds a user id;
    # consider moving it to configuration.
    headers = {
        'Cookie': 'c_first_ref=default; c_first_page=https://passport.csdn.net/login?code=public;c_segment=3;c_page_id=default;Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1612319443;UN=weixin_41401116;BT=1612319480283;p_uid=U010000;Hm_up_6bcd52f51e9b3dce32bec4a3997715ac={"islogin":{"value":"1","scope":1},"isonline":{"value":"0","scope":1},"isvip":{"value":"0","scope":1},"uid_":{"value":"weixin_41401116","scope":1}}; announcement-new={"isLogin":false,"announcementUrl":"https://blog.csdn.net/blogdevteam/article/details/112280974?utm_source=gonggao_0107","announcementCount":0};',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept': 'application/json,text/plain,*/*',
        'accept-encoding': 'gzip,deflate,br',
        'accept-language': 'zh-CN,zh;q =0.9',
    }
    # Query parameters for the search call
    param = {
        'q': q,
        't': 'all',
        'p': '1',
        's': '0',
        'tm': '0',
        'lv': '-1',
        'ft': '0',
        'l': '',
        'u': '',
        'ct': '-1',
        'pnt': '-1',
        'ry': '-1',
        'ss': '-1',
        'dct': '-1',
        'platform': 'pc'
    }
    # Headers for the individual blog pages; identical for every page, so
    # they are built once instead of on each loop iteration.
    blog_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    }

    response = requests.get(url=url, params=param, headers=headers,
                            timeout=request_timeout)
    json_href = response.json()

    str_code_list = []  # one HTML fragment per blog page that contained code
    url_list = []       # URL of the blog page each fragment came from

    # A missing or null 'result_vos' is treated as "no results".
    for dic in json_href.get('result_vos') or []:
        blog_url = dic.get('url') or ''
        if not blog_url.startswith('http'):
            continue

        # A single unreachable blog must not abort the whole search.
        try:
            blog_response = requests.get(url=blog_url, headers=blog_headers,
                                         timeout=request_timeout)
        except requests.RequestException:
            continue

        # Concatenate the children of every <code> element on the page.
        soup = BeautifulSoup(blog_response.text, 'lxml')
        str_code = ''.join(
            str(code_tag)
            for code in soup.find_all("code")
            for code_tag in code
        )
        # Skip pages that contain no code, so they neither produce an empty
        # template nor count toward model_number.
        if not str_code:
            continue

        # Make the text render as-is inside HTML.
        # NOTE(review): this also rewrites spaces inside the markup of any
        # nested tags within <code> — confirm that is acceptable.
        str_code = str_code.replace('\n', '<br>').replace(' ', '&nbsp;')

        # Append exactly one URL per fragment so both lists stay aligned.
        str_code_list.append(str_code)
        url_list.append(blog_url)
        if len(str_code_list) == model_number:
            break

    return str_code_list, url_list
# Simulate a Bokeyuan (博客园) search
def searchBOKEYUAN(q):
    """Run the Bokeyuan search backend for ``q``.

    Delegates to ``searchBoKeYuan`` with the module-level ``model_number``
    and returns the pair it produces as ``(str_code_list, url_list)``.
    """
    snippets, links = searchBoKeYuan(q, model_number)
    return snippets, links
# Simulate a 51CTO search
def search51CTO(q):
    """Run the 51CTO search backend for ``q``.

    Delegates to ``searchfiveone`` with the module-level ``model_number``
    and returns the pair it produces as ``(str_code_list, url_list)``.
    """
    snippets, links = searchfiveone(q, model_number)
    return snippets, links
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/cabseu/cosine-robot.git
git@gitee.com:cabseu/cosine-robot.git
cabseu
cosine-robot
Cosine Robot
master

搜索帮助