2 Star 0 Fork 0

crossin/crawler

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
ex3_zhihu_v3.py 2.11 KB
一键复制 编辑 原始数据 按行查看 历史
Crossin Yuan 提交于 2017-05-08 16:16 . update
import requests
import threading
import csv
# Zhihu "followees" API endpoint, split around the member url_token:
#   full URL = url_1 + <url_token> + url_2 + <offset>
# Each page returns up to 20 followees (limit=20).
url_1 = 'https://www.zhihu.com/api/v4/members/'
url_2 = '/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&limit=20&offset='
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/people/zhang-jia-wei/following?page=2',
    # 'xxxx' is a placeholder — replace with a real logged-in Zhihu cookie,
    # otherwise the API will reject the requests.
    'Cookie': 'xxxx'
}
# Shared crawl state, mutated by worker threads inside crawl().
to_crawl = ['crossin']  # url_tokens queued for crawling; seeded with the start user
crawled = []            # url_tokens already processed by the main loop
all_user = []           # collected rows: [token, name, follower_count, is_following]
finished = threading.Event()  # set by workers when a page has been processed
def crawl(url):
    """Fetch one followee page and collect users with > 600k followers.

    Appends newly discovered url_tokens to the shared ``to_crawl`` queue and
    their details to ``all_user``, then sets the ``finished`` event so the
    main loop can wake up and re-check the queue.

    :param url: full API URL for one page of followees.
    :return: the ``paging`` dict from the API response (contains ``totals``).

    NOTE(review): this runs on multiple worker threads; the
    check-then-append on ``to_crawl`` is not atomic, so duplicate tokens
    are possible under contention — confirm whether that matters here.
    """
    global to_crawl, crawled, finished
    # timeout= prevents a stalled connection from hanging the worker
    # thread forever (requests has no default timeout).
    req = requests.get(url=url, headers=headers, timeout=10)
    data = req.json()
    for user in data['data']:
        # Only keep "big V" accounts.
        if user['follower_count'] > 600000:
            token = user['url_token']
            if token not in to_crawl and token not in crawled:
                print(user['name'])
                to_crawl.append(token)
                all_user.append([token, user['name'], user['follower_count'], user['is_following']])
                print('add token', token)
    finished.set()
    return data['paging']
def get_following(user):
    """Crawl every followee page of *user*, capped at offset 1000.

    The first page is fetched synchronously to learn the total followee
    count; each remaining page is fetched on its own worker thread.

    :param user: the member's url_token (e.g. ``'crossin'``).
    """
    print('crawling', user)
    # (removed dead ``global to_crawl, crawled`` — neither name is
    # assigned or even referenced in this function)
    url = url_1 + user + url_2 + '0'
    paging = crawl(url)
    totals = paging['totals']
    count = 20
    # One thread per remaining 20-item page; stop at offset 1000 to bound
    # the amount of work per user.
    while count < totals and count < 1000:
        url = url_1 + user + url_2 + str(count)
        t = threading.Thread(target=crawl, args=(url,))
        t.start()
        count += 20
# Main scheduler: breadth-first crawl driven by the shared to_crawl queue.
print('to_crawl', to_crawl)
print('crawled', crawled)
while len(to_crawl) > 0:
    user = to_crawl.pop()
    crawled.append(user)
    get_following(user)
    # The queue may be momentarily empty while worker threads are still
    # fetching pages that can push new tokens onto to_crawl.  Wait for a
    # worker to signal ``finished`` (or time out after 3 s) before
    # re-checking; exit once only the main thread remains.
    while len(to_crawl) == 0 and threading.active_count() > 1:
        print(to_crawl, crawled)
        print('wait', threading.active_count())
        finished.clear()
        finished.wait(3)
# Persist the collected users.  newline='' is required by the csv module
# (it does its own newline translation; without it Windows gets blank rows
# between records).  utf-8-sig so Excel opens the Chinese headers correctly.
with open('zhihuV.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['token', '昵称', '关注数', '是否关注'])
    for data in all_user:
        writer.writerow(data)
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/crossin/crawler.git
git@gitee.com:crossin/crawler.git
crossin
crawler
crawler
master

搜索帮助