# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
import os, pymysql, time, re
import urllib.request  # urlretrieve lives in urllib.request on Python 3
ISOTIMEFORMAT = '%Y-%m-%d %X'  # timestamp format, e.g. 2016-08-02 21:46:38
def saveImgs(driver, img_path, img_url_list):
    # driver is unused here; the parameter is kept to match the call site below
    img_num = 0
    if not os.path.exists(img_path):  # create the image directory if it does not exist
        os.makedirs(img_path)
    while img_num < len(img_url_list):
        image_url = img_url_list[img_num]
        save_path = img_path + str(img_num) + '.jpg'
        urllib.request.urlretrieve(image_url, save_path)
        img_num = img_num + 1
    return img_num
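# A hedged sketch (not called by this script) of a more defensive variant of
# saveImgs: same inputs, but urllib.error handling so one dead image URL does
# not abort the whole product. The name saveImgsSafe is hypothetical.
def saveImgsSafe(img_path, img_url_list):
    import urllib.error
    if not os.path.exists(img_path):
        os.makedirs(img_path)
    saved = 0
    for img_num, image_url in enumerate(img_url_list):
        try:
            urllib.request.urlretrieve(image_url, img_path + str(img_num) + '.jpg')
            saved += 1
        except urllib.error.URLError:
            pass  # skip unreachable images instead of crashing the crawl
    return saved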
def craw_product_contents(product_url):
    product_info_list = []
    # driver = webdriver.PhantomJS()
    # driver = webdriver.Firefox()
    driver = webdriver.Chrome()
    driver.get(product_url)
    driver.maximize_window()
    # change the local country
    country_element = driver.find_element(By.XPATH, value="//*[@id='chrome-header']/header/div[2]/div/ul/li[3]/div/button")
    country_element.click()
    driver.implicitly_wait(4)  # element lookups will retry for up to 4 seconds
    driver.find_element(By.XPATH, value="//*[@id='country']/option[97]").click()
    driver.implicitly_wait(4)
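    # Hedged alternative (commented out): implicitly_wait() only sets a lookup
    # timeout, it does not pause the script. An explicit wait, assuming the
    # same XPath as above, makes the option's readiness observable:
    # from selenium.webdriver.support.ui import WebDriverWait
    # from selenium.webdriver.support import expected_conditions as EC
    # WebDriverWait(driver, 10).until(
    #     EC.element_to_be_clickable((By.XPATH, "//*[@id='country']/option[97]")))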
    url_product_id = re.findall(r'[0-9]{7}', product_url)[0]
    product_info_list.append(url_product_id)
    # "show more" button in the country modal; click it if it is enabled
    show_more = driver.find_element(By.XPATH,
        value="//*[@id='chrome-modal-container']/div[1]/div[2]/div/div/section/form/div[3]/button")
    if show_more.is_enabled():
        show_more.click()
        driver.implicitly_wait(4)
    # breadcrumb
    breadcrumb = ''
    breadcrumb_eles = driver.find_elements(By.XPATH, value="//div[@id='chrome-breadcrumb']/div/nav/ol/li")
    for breadcrumb_ele in breadcrumb_eles:
        breadcrumb = breadcrumb + breadcrumb_ele.text + '/'
    breadcrumb = breadcrumb.strip('/')
    product_info_list.append(breadcrumb)
    # product URL
    product_info_list.append(product_url)
    # URL state is 1
    product_url_stat = 1
    product_info_list.append(product_url_stat)
    # product code
    # product_code = ''
    # product_code = driver.find_element(By.XPATH, value="//*[@id='productDescriptionDetails']/div/p").text
    # product_info_list.append(product_code)
    # product website
    product_website = 'http://www.asos.com/'
    product_info_list.append(product_website)
    # product brand
    product_brand = 'ASOS'
    product_info_list.append(product_brand)
    # product crawl time, local timezone, e.g. 2016-08-02 21:46:38
    product_craw_time = time.strftime(ISOTIMEFORMAT, time.localtime(time.time()))
    product_info_list.append(product_craw_time)
    # product title
    product_title = driver.find_element(By.XPATH, value="//div[@id='pdp-react-critical-app']/span[1]/h1").text
    product_info_list.append(product_title)
    # product delivery
    product_delivery = driver.find_element(By.XPATH,
        value="//*[@id='pdp-react-critical-app']/span[4]/div[5]/div[2]").text
    product_info_list.append(product_delivery)
    # product price
    product_price = driver.find_element(By.XPATH, value="//span[@class='ky6t2']").text
    product_info_list.append(product_price)
    # product sizes; can be empty
    size = ''
    product_size = driver.find_elements(By.XPATH, value="//*[@id='variantSelector']/option")
    for ele in product_size:
        if 'Not' not in ele.text and 'Please' not in ele.text:  # skip placeholder options
            size = size + ele.text + ';;'
    size = size.strip(';')  # strip() removes a set of characters, so ';' covers ';;'
    product_info_list.append(size)
    # product colour
    product_colour = driver.find_element(By.XPATH, value="//*[@id='pdp-react-critical-app']/span[4]/div[2]/div/p").text
    product_info_list.append(product_colour)
    # product IMGs
    img_url_list = []
    ele_imgs = driver.find_elements(By.XPATH, value="//img[@class='gallery-image']")
    for ele in ele_imgs:
        img_url_list.append(ele.get_attribute("src"))
    img_url_list = sorted(set(img_url_list))  # dedupe; sorting keeps the file numbering deterministic
    img_path = breadcrumb if len(breadcrumb) > 0 else 'Unclassified'  # fall back when no breadcrumb
    img_number = saveImgs(driver, ROOTPATH + img_path + '/' + str(url_product_id) + "/", img_url_list)
    product_info_list.append(img_number)
    # there are at most 3 right-arrow buttons; click through the carousel if present
    right_arrows = driver.find_elements(By.XPATH, value="//button[@class='arrow-button arrow-button-right']")
    if len(right_arrows) in (2, 3):  # both cases click the same (second) arrow four times
        for _ in range(4):
            right_arrows[1].click()
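    # Hedged aside (not part of the original flow): a carousel click can be
    # intercepted mid-animation; one option is to retry per click. Sketch only:
    # from selenium.common.exceptions import ElementClickInterceptedException
    # for _ in range(4):
    #     try:
    #         right_arrows[1].click()
    #     except ElementClickInterceptedException:
    #         time.sleep(1)  # let the animation finish, then continue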
    # "you may also like" recommendations
    like_list = []
    you_may_also_like = driver.find_element(By.XPATH, value="//*[@id='mightLikeContainer']/section/ul")
    you_may_also_like_url = you_may_also_like.find_elements(By.XPATH, value="//li[@class='SuUpL']/div/a")
    for ele in you_may_also_like_url:
        href = ele.get_attribute('href')
        if href is not None and 'recommend' in href and href not in like_list:
            like_list.append(href)
    you_may_also_like_list = ';;'.join(like_list)
    product_info_list.append(you_may_also_like_list)
    # persist one tab-separated line per product; repr() keeps tabs/newlines escaped
    text_content = [repr(str(i)) for i in product_info_list]
    with open('text_content.txt', 'a', encoding='utf8') as f:
        f.write('\t'.join(text_content) + '\n')
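    # Since each field is written as repr(str(value)), a reader can recover the
    # original strings with ast.literal_eval; a minimal hedged sketch of the
    # reverse direction (not part of this script):
    # import ast
    # with open('text_content.txt', encoding='utf8') as f:
    #     for line in f:
    #         fields = [ast.literal_eval(cell) for cell in line.rstrip('\n').split('\t')]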
    # product_details_data = (url_product_id, breadcrumb, product_url, product_url_stat, product_code,
    #                         product_website, gender, product_brand, product_craw_time,
    #                         product_title, product_delivery, product_price, product_description, size,
    #                         product_care, product_colour, img_number, buy_the_look_list, you_may_also_like_list)
    # driver.quit()
    return product_info_list
def store_in_database(product_data):
    # start of database updating; 18 columns, so exactly 18 placeholders
    sql_update_content = """\
        INSERT INTO testdb.product(
            product_breadcrumbs,
            product_url,
            product_url_stat,
            product_sku,
            product_website,
            product_gender,
            product_brand,
            product_craw_time,
            product_title,
            product_estimated_delivery_time,
            product_price,
            product_desc,
            product_stock_hint,
            product_size_detail1,
            product_size_detail2,
            product_img_number,
            product_similar,
            product_match)
        VALUES
            (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # cursor.execute(sql_update_content, product_data)
    # db.commit()  # commit() is required before the insert is persisted
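    # A hedged sketch of how the commented-out lines above would run, assuming
    # the db/cursor objects created in __main__ below; pymysql persists nothing
    # until commit(), and rollback() undoes a failed insert:
    # try:
    #     cursor.execute(sql_update_content, product_data)
    #     db.commit()
    # except pymysql.MySQLError:
    #     db.rollback()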
if __name__ == '__main__':
    product_URLs = open("product_url_women.txt")
    ROOTPATH = "ASOS/"  # module-level, so craw_product_contents() can see it
    # db = pymysql.connect("localhost", "root", "Yyb520992!", "testdb", charset="utf8")
    # cursor = db.cursor()
    for product_url in product_URLs:
        product_data = craw_product_contents(product_url.strip())  # strip the trailing newline
        print(product_data)
        # try:
        #     product_data = craw_product_contents(product_url.strip())
        #     print(product_data)
        #     # store_in_database(product_data)
        # except:
        #     print("crawl page content error")
    product_URLs.close()
    # db.close()