1 Star 0 Fork 0

javasqlbug/pythonProject

Create your Gitee Account
Explore and code with more than 12 million developers. Free private repositories! :)
Sign up
This repository doesn't specify a license. Please pay attention to the specific project description and its upstream code dependencies when using it.
Clone or Download
CrawlerXingzhengquhuaTianjinFileLinuxTest.py 2.41 KB
Copy Edit Raw Blame History
niezhili authored 2020-09-16 09:31 . Initial commit
# -*- coding: UTF-8 -*-
# Adapted from http://c.biancheng.net/view/2011.html
import requests #导入requests包
from bs4 import BeautifulSoup
import re
import time
import sys
if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the implicit str/unicode conversion codec.
    # Python 3 has neither the reload() builtin nor sys.setdefaultencoding(),
    # so the hack is skipped there.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')

# Output file; one CSV-like line is appended per crawled row.
file_name = './xingzhengquhua.txt'

BASE_URL = 'https://xingzhengquhua.51240.com/'
# Code of the page the crawl starts from.
ROOT_CODE = '120100000000'
ROW_SELECTOR = '#main_content > table > tr > td > table > tr'
# The first rows of every listing table are skipped (not data rows).
HEADER_ROWS = 3
# Seconds to pause before handling each row, to stay polite to the site.
REQUEST_DELAY = 1
# Seconds before an unanswered HTTP request is abandoned.
REQUEST_TIMEOUT = 30
# Value of the last output column for rows of the start page / deepest page.
FIRST_LEVEL = 4
LAST_LEVEL = 6

DIGITS = re.compile(r'\d+')
NON_DIGITS = re.compile(r'\D+')


def build_url(code):
    """Return the listing-page URL for the given division code."""
    return BASE_URL + str(code) + '__xingzhengquhua/'


def parse_row(text, split_type=False):
    """Split the text of one table row into ``(code, title, kind)``.

    ``code`` is the first run of digits in ``text`` and ``title`` the first
    run of non-digits.  With ``split_type=True`` the digit run is treated as
    a 12-digit code immediately followed by a 3-digit type code, and is
    split accordingly; otherwise ``kind`` is the empty string.

    Returns ``None`` when the row has no digits or no non-digit text, so
    callers can skip rows that are not data rows instead of crashing.
    """
    digit_runs = DIGITS.findall(text)
    name_runs = NON_DIGITS.findall(text)
    if not digit_runs or not name_runs:
        return None
    code = digit_runs[0]
    kind = ''
    if split_type:
        # Slice the digit string itself rather than the repr of the match
        # list, whose layout differs between Python versions.
        code, kind = code[:12], code[12:15]
    # NOTE: a name that itself contains digits is cut at the first digit,
    # as in the original implementation.
    return code, name_runs[0], kind


def format_line(code, title, kind, level):
    """Return the output line ``code,title,kind,level`` ending in a newline."""
    return u'{0},{1},{2},{3}\n'.format(code, title, kind, level)


def fetch_rows(code):
    """Download the listing page for ``code`` and return its data rows.

    Request failures (timeout, connection error, HTTP error status) are
    reported on stderr and yield an empty list, so one bad page does not
    abort the whole crawl.
    """
    url = build_url(code)
    print(url)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        sys.stderr.write('request failed for %s: %s\n' % (url, exc))
        return []
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select(ROW_SELECTOR)[HEADER_ROWS:]


def crawl(out, code, level, last_level=LAST_LEVEL):
    """Write every row listed under ``code`` to ``out``, then descend.

    Each row is written with ``level`` as its last column.  Rows above
    ``last_level`` are followed to their own listing page, which is crawled
    with ``level + 1``.  Rows at ``last_level`` carry a type code appended
    to their division code; it is split off into the third column.
    """
    deepest = level >= last_level
    for row in fetch_rows(code):
        time.sleep(REQUEST_DELAY)
        record = parse_row(row.get_text(), split_type=deepest)
        if record is None:
            continue
        print(record)
        row_code, title, kind = record
        out.write(format_line(row_code, title, kind, level))
        # Flush per record so an interrupted crawl keeps what it has found.
        out.flush()
        if not deepest:
            crawl(out, row_code, level + 1, last_level)


def main():
    """Crawl from ROOT_CODE and append the results to ``file_name``."""
    import io  # io.open takes an explicit encoding on Python 2 and 3 alike

    with io.open(file_name, 'a', encoding='utf-8') as out:
        crawl(out, ROOT_CODE, FIRST_LEVEL)
    print('程序执行结束')


if __name__ == '__main__':
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/javasqlbug/pythonProject.git
git@gitee.com:javasqlbug/pythonProject.git
javasqlbug
pythonProject
pythonProject
master

Search

23e8dbc6 1850385 7e0993f3 1850385