# Scraper for emao.com (Yimao Auto) Q&A pages
# Site: http://ask.emao.com/question/1.html
# author: 萧石子 1037654919@qq.com
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import sqlalchemy
URL_TEMPLATE = 'http://ask.emao.com/question/{}.html'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}
def get_data():
    """Crawl question pages and collect [question, question_info, answers] rows."""
    datay = []
    for i in range(1, 12):  # temporary range for testing; the full site has about 11089 question pages
        if i % 100 == 0:
            print('Resting for 10 seconds')
            time.sleep(10)
        url = URL_TEMPLATE.format(i)
        try:
            print(url)
            html = requests.get(url, headers=HEADERS, timeout=10).text
            soup = BeautifulSoup(html, 'lxml')
            # The question title and its meta info sit in the left column ("mian-le")
            ques = soup.find('div', class_='mian-le').find('div', class_='list-title')
            qu = ques.find('dd').find('p').text.strip()
            tit_info = ques.find('p', class_='tit-info').text.strip()
            # Each answer is a <dl> block inside the answer list
            answers = soup.find('div', class_='answer-cnt').find('div', class_='listInfo').find_all('dl')
            answer_texts = []
            for answer in answers:
                info = answer.find('div', class_='infoLt').find('p').text
                answer_texts.append(info)
            datay.append([qu, tit_info, answer_texts])
        except Exception:
            # Skip pages that are missing or whose markup does not match
            continue
    # Save locally
    # pd.DataFrame(datay, columns=['question', 'question_info', 'Answers']).to_excel('ask_emao_0422.xlsx')
    return datay
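

# Persisting results: a minimal sketch, assuming the row layout returned by get_data()
# above. The helper name save_data, the Excel file name, and the SQLite URL/table name
# are illustrative assumptions, not part of the original script.
def save_data(datay):
    df = pd.DataFrame(datay, columns=['question', 'question_info', 'answers'])
    # Each 'answers' cell is a Python list; join it into plain text for flat storage
    df['answers'] = df['answers'].apply(lambda items: '\n'.join(items))
    df.to_excel('ask_emao_0422.xlsx', index=False)
    # sqlalchemy is imported at the top but unused; one possible use is a local SQLite dump
    engine = sqlalchemy.create_engine('sqlite:///ask_emao.db')
    df.to_sql('questions', engine, if_exists='replace', index=False)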
if __name__ == '__main__':
    data = get_data()
    print('Scraped {} question pages'.format(len(data)))
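    # Optional: persist the crawl with the illustrative save_data() sketch above
    # save_data(data)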