#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName :maigoo_test.py
# @Time :2023/11/17
# @Author :CL
# @email :1037654919@qq.com
# Traverse maigoo.com and collect all of its URLs.
# URL = 'https://www.maigoo.com/'
import re
import requests
from bs4 import BeautifulSoup
try:
    from utils import mongo_manager  # project-local MongoDB wrapper (utils.py)
except ImportError:
    mongo_manager = None  # fall back to the sketch below
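# Fallback: a minimal pymongo-based sketch of the mongo_manager interface this
# script relies on. The method names are inferred from the call sites below and
# the connection URI is an assumption -- this is NOT the real utils.py code.
if mongo_manager is None:
    from pymongo import MongoClient

    class mongo_manager:
        def __init__(self, collection, db='public_data',
                     uri='mongodb://localhost:27017'):  # assumed local default
            self._client = MongoClient(uri)
            self._coll = self._client[db][collection]

        def insertOne(self, doc):
            # Raises DuplicateKeyError on a repeated _id; callers catch and print it.
            return self._coll.insert_one(doc)

        def findAll(self, query):
            return self._coll.find(query or {})

        def updateOne(self, flt, doc):
            return self._coll.replace_one(flt, doc)

        def deleteOne(self, doc):
            return self._coll.delete_one({'_id': doc['_id']})

        def not_exist(self, query):
            return self._coll.count_documents(query, limit=1) == 0

        def close(self):
            self._client.close()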
maigoo_catalog = mongo_manager('maigoo_catalog', db='public_data')
headers = {
    "authority": "www.maigoo.com",
    "accept": "application/json, text/javascript, */*; q=0.01",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "referer": "https://www.maigoo.com/",
    "sec-ch-ua": "\"Not.A/Brand\";v=\"8\", \"Chromium\";v=\"114\", \"Google Chrome\";v=\"114\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Linux\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest"
}
# Session cookies captured from a logged-in browser session; they are
# account-specific snapshots and will eventually expire.
cookies = {
    "u": "e3c9bfa3a2091ee9599fc6277636b942",
    "PHPSESSID": "91a879ed4e32c9f50c48a4b593e39c51",
    "history": "cid%3A2476"
}
user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:73.0) Gecko/20100101 Firefox/73.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0'
]
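# Note: user_agent_list is declared but never used by the requests below.
# If rotation is wanted, one option (an addition, not part of the original
# flow) is to patch the shared headers before each request:
#
#     import random
#     headers['user-agent'] = random.choice(user_agent_list)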
def get_main(url="https://www.maigoo.com/"):
    """Fetch the homepage and return the level-1 catalog entries."""
    cookies = {
        "u": "e3c9bfa3a2091ee9599fc6277636b942",
        "PHPSESSID": "91a879ed4e32c9f50c48a4b593e39c51",
        "Hm_lvt_de3f6fd28ec4ac19170f18e2a8777593": "1700197848",
        "Hm_lpvt_de3f6fd28ec4ac19170f18e2a8777593": "1700197848",
        "history": "cid%3A2476%2Ccid%3A1746%2Ccid%3A4871"
    }
    response = requests.get(url, headers=headers, cookies=cookies)
    print(response)
    soups = BeautifulSoup(response.text, "lxml")
    zhishi = soups.find("div", class_="zhishi catmenu").find_all('li')  # knowledge categories
    brands = soups.find("div", class_="brand catmenu").find_all('li')   # brand categories
    lists = []
    for item in zhishi:
        name = item.find('a').get_text()
        href = item.find('a').get('href')
        lists.append({'level': 1, "type": 'zhishi', "name": name, "href": href})
    for item in brands:
        name = item.find('a').get_text()
        href = item.find('a').get('href')
        lists.append({'level': 1, "type": 'brand', "name": name, "href": href})
    return lists
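# Shape of the records returned by get_main() (values illustrative):
#     {'level': 1, 'type': 'zhishi', 'name': '<menu label>',
#      'href': 'https://www.maigoo.com/...'}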
def save_catalog():
    """Save the level-1/2/3 catalog tree into MongoDB, one document per URL."""
    lists = get_main()
    for l in lists:
        l["_id"] = l['href']
        try:
            maigoo_catalog.insertOne(l)
        except Exception as e:
            print(e)
        url = l['href']
        response = requests.get(url, headers=headers, cookies=cookies)
        print(response.url, response)
        soups = BeautifulSoup(response.text, "lxml")
        try:
            # "hotcat" panels on each level-1 page
            scroll = soups.find("div", class_="hotcat").find("div", class_="scroll").find_all('div', class_='list')
            for s in scroll:
                ttl_name = s.find("div", class_="ttl").find("a").get_text()
                ttl_href = s.find("div", class_="ttl").find("a")['href']
                data = {'level': 2, "name": ttl_name, "href": ttl_href, "parent_href": url}
                data["_id"] = data['href']
                data["type"] = l['type']
                try:
                    maigoo_catalog.insertOne(data)
                except Exception as e:
                    print(e)
                for a in s.find_all('a'):
                    data = {'level': 3, "name": a.get_text(), "href": a['href'], "parent_href": ttl_href}
                    data["_id"] = data['href']
                    try:
                        maigoo_catalog.insertOne(data)
                    except Exception as e:
                        print(e)
        except Exception as e:
            print(e)
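# save_catalog() persists a three-level tree keyed by URL (_id == href):
#   level 1 -- top-nav entries from get_main()
#   level 2 -- "hotcat" panel titles found on each level-1 page
#   level 3 -- individual links inside each level-2 panel
# Duplicate inserts fail on _id and are merely printed, so reruns are idempotent.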
def get_brand10(url="https://www.maigoo.com/brand10/"):
    """Collect name/href pairs from the brand10 listing page."""
    response = requests.get(url, headers=headers, cookies=cookies)
    print(response.url, response)
    lists = []
    if response.status_code == 200:
        soups = BeautifulSoup(response.text, 'lxml')
        # take the <li> items from the first and the last itembox blocks
        itembox = soups.find("div", class_="itembox").find_all('li')
        itembox += soups.find_all("div", class_="itembox")[-1].find_all('li')
        for it in itembox:
            try:
                name = it.find('a').get_text().strip().split('\n')[0]
                href = it.find('a')['href']
                lists.append({'name': name, 'href': href})
            except Exception:
                pass
    return lists
def get_category_brand(url="https://www.maigoo.com/category/brand/"):
    """Collect name/href pairs from a category landing page."""
    response = requests.get(url, headers=headers, cookies=cookies)
    print(response.url, response)
    lists = []
    if response.status_code == 200:
        soups = BeautifulSoup(response.text, 'lxml')
        lis = soups.find("div", id="container").find("div", class_="itembox").find_all('a')
        for it in lis:
            try:
                name = it.get_text()
                href = it['href']
                lists.append({'name': name, 'href': href})
            except Exception:
                pass
    return lists
def get_urls(seed):
    """Expand one seed page: return every on-site link not yet stored in MongoDB."""
    url = seed['href']
    response = requests.get(url, headers=headers, cookies=cookies)
    print(response.url, response)
    lists = []
    if response.status_code == 200:
        soups = BeautifulSoup(response.text, 'lxml')
        lis = soups.find_all("a", href=True)
        for it in lis:
            try:
                name = it.get_text().strip().split('\n')[0].split(' ')[0]
                href = it['href']
                href = re.split(r'#|\?', href)[0]  # drop fragment and query string
                if 'https://www.maigoo.com/' in href:
                    if maigoo_catalog.not_exist({"_id": href}) and maigoo_catalog.not_exist({"name": name}):
                        data = {'name': name, 'href': href, 'parent_href': url}
                        if 'level' in seed:
                            data['level'] = seed['level'] + 1
                        else:
                            data['level'] = 2
                        lists.append(data)
            except Exception:
                pass
    return lists
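# The re.split(r'#|\?', href)[0] normalization keeps only the path portion,
# e.g. (hypothetical URL):
#     'https://www.maigoo.com/goomai/123.html?from=nav#top'
#     -> 'https://www.maigoo.com/goomai/123.html'
# delete_data() below cleans up any records that predate this normalization.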
# Traverse maigoo.com: repeatedly expand unvisited level-2 seeds until the
# frontier is (nearly) exhausted.
def main():
    while True:
        seeds = maigoo_catalog.findAll({'status': None, 'level': 2})
        # materialize the cursor before mutating the collection it reads from
        search_list = list(seeds)
        print('len(search_list)', len(search_list))
        for seed in search_list:
            lists = get_urls(seed)
            if lists:
                for l in lists:
                    l["_id"] = l['href']
                    try:
                        maigoo_catalog.insertOne(l)
                        print(l['name'], l['href'])
                    except Exception:
                        pass  # duplicate _id: the URL was already stored
            seed['status'] = 'success'
            maigoo_catalog.updateOne({'_id': seed['_id']}, seed)
        if len(search_list) < 10:
            break
def delete_data():
    """Remove documents whose href still carries a fragment or query string."""
    seeds = maigoo_catalog.findAll('')
    for s in seeds:
        if '#' in s['href'] or '?' in s['href']:
            print(s)
            maigoo_catalog.deleteOne(s)
    maigoo_catalog.close()
if __name__ == "__main__":
    # json_data = get_china_json()
    # save_catalog()
    # lists = get_main()
    # lists = get_category_brand(url='https://www.maigoo.com/category/zhishi/')
    main()
    # delete_data()
    maigoo_catalog.close()