1 Star 0 Fork 0

cxl-starkchen/cook_assistant

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
dataCrawler.py 3.31 KB
一键复制 编辑 原始数据 按行查看 历史
cxl-starkchen 提交于 2021-11-09 13:39 . original code
from bs4 import BeautifulSoup
from urllib.request import urlopen
from dataDefine import *
import re
import numpy as np
def updateMenu():#WEB, NAME
dic = {}
cnt = 0
base_url = 'https://www.meishij.net/chufang/diy/jiangchangcaipu/?&page='
for pages in range(1,56+1):
html = urlopen(base_url+str(pages)).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
web = soup.find_all('a', 'big')
# print(web)
web = [str(i) for i in web]
weblist, namelist = [], []
re_link = re.compile(r'(http://www.*.html)')
re_name = re.compile(r'title="(.*)"')
for item in web:
match_l = re_link.search(item)
match_n = re_name.search(item)
if match_l:
weblist.append(match_l.group(1))
else:
print('fail1'.center(20, '#'))
print(item)
if match_n:
namelist.append(match_n.group(1))
else:
match_again = re.search(r'title="【添喜的厨房】 (\w*)"', item)
if match_again:
namelist.append(match_again.group(1))
else:
cnt += 1
print(('fail'+str(cnt)).center(20, '#'))
print(item)
namelist.append('Error')
lenth = max(len(weblist),len(namelist))
for i in range(0, lenth):
dic[namelist[i]] = weblist[i]
return dic
def updateDetail(address):#raw material
html = urlopen(address).read().decode('utf-8')
soup = BeautifulSoup(html,features='lxml')
body = soup.find_all('li')
re_material = re.compile(r'target="_blank">(\w*)</a><span>(.*)</span></h4><a')
re_material2 = re.compile(r'target="_blank">(\w*)</a></h4><span>(.*)</span><a')
re_material3 = re.compile(r'javascript:;">(\w*)</a></h4><span>(\w*)</span><a')
body = [str(l) for l in body]
data = DData()
for item in body:
match_mater = re_material.search(item)
match_mater2 = re_material2.search(item)
match_mater3 = re_material3.search(item)
if match_mater:
data.rMaterial[match_mater.group(1)]=match_mater.group(2)
elif match_mater2:
data.rMaterial[match_mater2.group(1)]=match_mater2.group(2)
elif match_mater3:
data.rMaterial[match_mater3.group(1)]=match_mater3.group(2)
else:
pass
# print('Error')
# data.rMaterial['!']='Error'
return data.rMaterial
def updateSteps(addr):#steps
html = urlopen(addr).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
# print(soup.body)
match_n = re.search(r'"step":"(\w*)步",',str(soup.body))
total = -1
if match_n:
total = match_n.group(1)
else :
print('step not found'.center(20,'!'))
body = soup.find_all('div','c')
re_steps = re.compile(r'<p>(.*)</p>\s+')
body = [str(i) for i in body]
data = []
for step in body:
match_s = re_steps.search(step)
if match_s:
s = match_s.group(1)
if '<img alt=' in s:
s = re.search(r'src="(.*)"/>',s).group(1)
data.append(s)
else:
pass
# print('Not Find'.center(20,'!'))
# print(step)
return data,total
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/cxl-starkchen/cook_assistant.git
git@gitee.com:cxl-starkchen/cook_assistant.git
cxl-starkchen
cook_assistant
cook_assistant
main

搜索帮助