3 Star 1 Fork 1

李建林/qichezhijia

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
CrealerCar.py 3.38 KB
一键复制 编辑 原始数据 按行查看 历史
import requests
from bs4 import BeautifulSoup
import re
from multiprocessing import Pool
url = 'https://www.autohome.com.cn/grade/carhtml/{}.html'
def year(endurl, ent3, ent1):
endresu = []
res = requests.get(endurl)
soup = BeautifulSoup(res.text, 'html.parser')
f = open('CarInfo.txt', "a+", encoding='utf-8')
for ant0 in soup.select('#divSeries'):
for ant1 in ant0.select('.interval01'):
for ant2 in ant1.select('.interval01-list'):
for ant3 in ant2.select('li'):
resu2 = {}
resu2['车系'] = ent3.select('h4 a')[0].text.replace("·", "")
resu2['品牌'] = ent1.select('dt div a')[0].text.replace("·", "")
resu2['年款'] = ant3.select('.interval01-list-cars p a')[0].text.replace("·", "")
if ant2.select('.interval01-list-guidance div'):
resu2['价格'] = ant3.select('.interval01-list-guidance div')[0].text
else:
resu2['价格'] = '暂无报价'
print(resu2)
f.write(ent1.select('dt div a')[0].text.replace("·", "")+" "+ent3.select('h4 a')[0].text.replace("·", "")+" "+ant3.select('.interval01-list-cars p a')[0].text.replace("·", "")+"\n")
endresu.append(resu2)
f.close()
return endresu
def fn(carurl):
endresult1 = []
res = requests.get(carurl)
soup = BeautifulSoup(res.text, 'html.parser')
for ent1 in soup.select('dl'):
# print(ent1)
for ent2 in ent1.select('.rank-list-ul'):
# print(ent2)
for ent3 in ent2.select('li'):
# print(ent3)
if ent3.select('h4'):
m = re.search('<a href="//car.autohome.com.cn/price(.*)">报价', str(ent3))
if m:
ypnum = m.group(1)
endresult1.extend(year("https://car.autohome.com.cn/price" + ypnum, ent3, ent1))
return endresult1
if __name__ == '__main__':
endresult = []
# res=[]
# 多线程(快)(适用于网络密集型请求)
# for i in range(97, 123):
# p = MyThread(chr(i).upper())
# # res.append(p)
# p.start()
# 多进程(还可以)(cpu密集型,io密集型,网络密集型都适用,但相对占用CPU资源)
# for i in range(97,123):
# p=MyProcess(chr(i).upper())
# p.start()
# 进程池(慢)
p = Pool(4)
for i in range(97, 123):
newurl=url.format(chr(i).upper())
# newurl = url.format(chr(97).upper())
res = p.apply(fn, args=(newurl,))
endresult.extend(res)
# p.close()
# p.join()
print(endresult.__len__())
# newurl = url.format(chr(97).upper())
# fn(newurl)
# for x in res:
# x.join()
# endresult.extend(x.get_result())
# df = pandas.DataFrame(endresult)
# print(df)
# df.to_excel('car18.xlsx')
# f = open('CarInfo.txt', encoding='utf-8')
# for i in f:
# print(i)
# IO密集型代码(文件处理、网络爬虫等),
# 多线程能够有效提升效率(单线程下有IO操作会进行IO等待,造成不必要的时间浪费,
# 而开启多线程能在线程A等待时,自动切换到线程B,可以不浪费CPU的资源,从而能提升程序执行效率)。
# 所以python的多线程对IO密集型代码比较友好。
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/lijianlin819/qichezhijia.git
git@gitee.com:lijianlin819/qichezhijia.git
lijianlin819
qichezhijia
qichezhijia
master

搜索帮助