1 Star 1 Fork 0

萧石/car_qa

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
scarpy_ask_emao.py 1.81 KB
一键复制 编辑 原始数据 按行查看 历史
萧石 提交于 2024-08-07 15:41 . 变更
#一猫汽车问答爬取
#网站:http://ask.emao.com/question/1.html
#author:萧石子 1037654919@qq.com
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import _thread
import re
import sqlalchemy
# Example question page. get_data() builds its own per-page URL and does not
# read this module-level value.
url = 'http://ask.emao.com/question/1.html'

# HTTP request headers passed along with each page fetch.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/38.0.2125.122 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}
def _parse_question(html):
    """Extract ``[question, question_info, answers]`` from one question page.

    Raises IndexError or AttributeError when the page does not contain the
    expected elements (for example, a missing or removed question).
    """
    soup = BeautifulSoup(html, 'lxml')
    ques = soup.find_all('div', class_="mian-le")[0].find_all('div', class_='list-title')[0]
    qu = ques.find_all('dd')[0].find_all('p')[0].text.strip()
    tit_info = ques.find_all('p', class_='tit-info')[0].text.strip()
    answers = (
        soup.find_all('div', class_='answer-cnt')[0]
        .find_all('div', class_='listInfo')[0]
        .find_all('dl')
    )
    answer_texts = [
        answer.find_all("div", class_='infoLt')[0].find('p').text
        for answer in answers
    ]
    return [qu, tit_info, answer_texts]


def get_data(start=0, stop=11, timeout=10):
    """Crawl question pages from ask.emao.com and collect their Q&A text.

    Pages ``http://ask.emao.com/question/{i}.html`` are fetched for every
    ``i`` in ``range(start, stop)``. Pages that cannot be fetched or do not
    have the expected layout are skipped, so the crawl is best-effort.

    Args:
        start: First page index to fetch. The default 0 matches the original
            hard-coded ``range(11)``.
        stop: Index one past the last page to fetch. The default 11 is the
            original temporary limit; the original comment mentions 11089 as
            the intended length (presumably the full crawl size -- confirm).
        timeout: Seconds to wait for each HTTP request before giving up on
            that page.

    Returns:
        A list with one ``[question, question_info, answers]`` entry per
        successfully parsed page, where ``answers`` is a list of strings.
    """
    datay = []
    for i in range(start, stop):
        # Pause before every 100th page (whenever i + 1 is a multiple of 100).
        if (i + 1) % 100 == 0:
            print('休息10秒')
            time.sleep(10)
        url = 'http://ask.emao.com/question/{}.html'.format(i)
        print(url)
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            html = requests.get(url, headers=HEADERS, timeout=timeout).text
            datay.append(_parse_question(html))
        except (requests.RequestException, IndexError, AttributeError) as exc:
            # Network failure or unexpected page layout: skip this page, but
            # report it instead of silently swallowing every exception.
            print('skipped {}: {!r}'.format(url, exc))
            continue
    # To save locally:
    # pd.DataFrame(datay, columns=['question', 'question_info', 'Answers']).to_excel('ask_emao_0422.xlsx')
    return datay
if __name__ == "__main__":
    # Running the script only prints an empty line; the crawl call below is
    # left disabled, as in the original.
    print()
    # get_data()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/beihai_xiaoshi/car_qa.git
git@gitee.com:beihai_xiaoshi/car_qa.git
beihai_xiaoshi
car_qa
car_qa
master

搜索帮助