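# NOTE: the three quoted lines below are repository web-page dialog text that was captured along with this file; they are not part of the program.
# "Code pull complete; the page will refresh automatically."
# "The sync operation will force-sync from zhenghua/Scrpay. It overwrites every change made since the repository was forked and cannot be undone!!!"
# "After you confirm, the sync runs in the background and the page refreshes when it finishes; please wait patiently."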
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup,NavigableString, Tag
from pyquery import PyQuery as pq
import re
from urllib.parse import urlencode
import json
import base64
import os
import requests
import pytesseract
from PIL import Image
# response = urllib.request.urlopen('https://www.python.org')
# print(response.read().decode('utf-8'))
# #<class 'http.client.HTTPResponse'>,返回值为http.client.HTTPResponse类型,主要包含方法read(),readinto(),getheader(name),
# #getheaders(),fileno()等方法,以及属性msg,version,status,reason,debuglevel,closed等
# print(type(response))
# print(response.getheader('Server'))
# print(response.getheaders())
# print(response.msg)
# print(response.status)
# print(response.reason)
# import urllib.parse
# import urllib.request
#这里我们传递了一个参数word,值是hello。它需要被转码成bytes(字节流)类型。
#其中转字节流采用了bytes()方法,该方法的第一个参数需要是str(字符串)类型,
#需要用urllib.parse模块里的urlencode()方法来将参数字典转化为字符串;第二个参数指定编码格式,这里指定为utf8
# response = urllib.request.urlopen('http://httpbin.org/post',data=data)
# print(response.read().strip())
# print(response.getheader('Host'))
# print(type(urllib.parse.urlencode({'word':'hello'})))
# print(urllib.parse.urlencode({'word':'hello'}))
# import urllib.parse
# base_url = 'http://www.baidu.com?'
# data = {'word':'hello','key':'value'}
# url = base_url + urllib.parse.urlencode(data)
# print(url)
# import urllib.request
# import urllib.error
# import socket
# # If no response arrives within the timeout (2 seconds below), an exception is raised
# try:
# response = urllib.request.urlopen('https://www.python.org',timeout=2)
# print(response.geturl())
# print(response.info())
# print(response.getcode())
# except urllib.error.URLError as e:
# if isinstance(e.reason,socket.timeout):
# print('time out ')
# import urllib.request
#
# request = urllib.request.Request('https://www.python.org')
# #虽然还是使用urlopen发送请求,但是参数变成了request类,可以构造更复杂的参数
# #headers = {'user-agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
# request.add_header('user-agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')
# reponse = urllib.request.urlopen(request)
# print(reponse.read().decode('utf-8'))
# from urllib import request,parse
#
# url = 'http://httpbin.org/post'
# headers = {
# 'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
# 'host':'httpbin.org'
# }
# dict = {
# 'name':'germey'
# }
# data = bytes(parse.urlencode(dict),encoding='utf8')
# req = request.Request(url=url,data=data,headers=headers,method='POST')
# response=request.urlopen(req)
# print(response.read().decode('utf-8'))
# from urllib.error import URLError
# from urllib.request import ProxyHandler,build_opener
# #使用ProxyHandler,其参数是一个字典,键名是协议类型,键值是代理链接,可以添加多个代理
# proxy_handler = ProxyHandler({
# 'http':'http://127.0.0.1:9743',
# 'https':'https://127.0.0.1:9743'
# })
# opener = build_opener(proxy_handler)
# try:
# response = opener.open('https://www.baidu.con')
# print(response.read().decode('utf-8'))
# except URLError as e:
# print(e.reason)
#在控制台输出cookie
# import http.cookiejar,urllib.request
# cookie = http.cookiejar.CookieJar()
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
# print(response.read().decode('utf-8'))
# for item in cookie:
# print(item.name + '=' + item.value)
# Save the cookies to a file (Mozilla format first, then LWP format)
# import http.cookiejar,urllib.request
# filename = 'cookie.txt'
# cookie = http.cookiejar.MozillaCookieJar(filename)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
# cookie.save(ignore_discard=True,ignore_expires=True)
# import http.cookiejar,urllib.request
# filename = 'cookieLWP.txt'
# cookie = http.cookiejar.LWPCookieJar(filename)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
# cookie.save(ignore_discard=True,ignore_expires=True)
#cookie读取
#这里调用load()方法读取本地cookie文件,获取到了cookies的内容。
#不过前提是我们首先生成了LWPCookieJar格式的cookies文件,并保存在了本地
# import http.cookiejar,urllib.request
# cookie = http.cookiejar.LWPCookieJar()
# cookie.load('cookieLWP.txt',ignore_discard=True,ignore_expires=True)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
# print(response.read().decode('utf-8'))
#
# from urllib import request,error
# #因为HTTPError的父类是URLError,所以可以先捕获HTTPError,如果不是HTTPError,在捕获URLError,输出错误原因,
# # 最后用else处理正确的情况
# try:
# response = request.urlopen('http://tool.oschina.net/codeformat/json')
# print(response.read())
# except error.HTTPError as e:
# print(e.reason,e.code,e.headers,sep = '\n')
# except error.URLError as e:
# print(e.reason)
# else:
# print('request sucessful !')
# from urllib.parse import urlunparse
# data = ['http','www.baidu.com','index.html','user','a=6','comment']
# print(urlunparse(data))
# from urllib.parse import urlparse
# result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
# print(type(result), result)
# from urllib.parse import urlunparse
# data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
# print(urlunparse(data))
# from urllib.parse import urlsplit
#
# result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
# print(result)
# from urllib.parse import urljoin
#
# print(urljoin('http://www.baidu.com', 'FAQ.html'))
# print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
# print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))
# print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
# print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
# print(urljoin('http://www.baidu.com', '?category=2#comment'))
# print(urljoin('www.baidu.com', '?category=2#comment'))
# print(urljoin('www.baidu.com#comment', '?category=2'))
# from urllib.parse import urlencode
#
# params = {
# 'name': 'germey',
# 'age': 22
# }
# base_url = 'http://www.baidu.com?'
# url = base_url + urlencode(params)
# print(url)
# from urllib.parse import parse_qs
# import json
# query = 'name=germey&age=22'
#
# dict = parse_qs(query)
# print(dict)
# print(type(dict))
#
# str_json = json.dumps(dict)
#
# print(str_json,type(str_json))
# from urllib.parse import parse_qsl
#
# query = 'name=germey&age=22'
# print(parse_qsl(query))
# from urllib.parse import quote,unquote
#
# keyword = '壁纸'
# url = 'https://www.baidu.com/s?wd=' + quote(keyword)
# print(url)
#
# url1 = unquote(url)
# print(url1)
# import requests
# #调用get()方法实现与urlopen()相同的操作,得到一个response对象,在输出它的类型,状态码,响应体的类型等。
# r = requests.get('https://www.baidu.com/')
# # print(r.text.encode(encoding='utf-8'))
# # print(type(r.text))
# # print(type(r))
# # print(r.status_code)
# print(r.cookies)
# #get请求
# #请求的连接会自动被构建成 http://httpbin.org/get?name=germey&age=22,
# #另外,网页的返回格式是str类型的,但是它其实是json格式的,所以,如果想直接解析返回结果,得到一个字典的话,
# # 可使用json()进行转化,转化之后为json类型的字典类型
# import requests
# data = {
# 'name': 'germey',
# 'age': 22
# }
# r = requests.get("http://httpbin.org/get", params=data)
# print(r.text)
# print(type(r.text))
# print(r.json())
# print(type(r.json()))
# import requests
# import re
#
# # headers = {
# # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
# # }
# r = requests.get("https://www.zhihu.com/explore") #, headers=headers)
# pattern = re.compile('explore-feed.*?question_link.*?>(.*?)</a>', re.S)
# titles = re.findall(pattern, r.text)
# print(titles)
# import requests
#
# r = requests.get("https://github.com/favicon.ico")
# print(r.content) # show the raw binary data
# print(r.text) #图片为二进制,内容打印会乱码
# #存储图片到本地
# with open('favicon.ico','wb') as f:
# f.write(r.content)
# import requests
# #可以发现,成功获得了返回了结果,其中form部分就是提交的表单,证明post请求发送成功
# data = {'name':'germey','age':'22','sex':'nan'}
# r = requests.post('http://httpbin.org/post',data=data)
# print(r.text)
#
# import requests
# #这里分别打印输出status_code属性得到状态码,输出headers属性得到响应头,输出cookies属性得到Cookies,输出url属性得到URL,输出history属性得到请求历史
# r = requests.get('http://www.baidu.com')
# print(type(r.status_code), r.status_code)
# print(type(r.headers), r.headers)
# print(type(r.cookies), r.cookies)
# print(type(r.url), r.url)
# print(type(r.history), r.history)
# import requests
# r = requests.get('https://www.baidu.com')
# print(r.cookies)
# for key,value in r.cookies.items():
# print(key + '=' + value)
# import requests
#
# headers = {
# 'Cookie':'_zap=7c81e1ab-7638-4d89-a8df-7f4f73d5d318; '
# '_xsrf=kBJPeG3JoKt8RyDfotQ5tumQD56YX6TX; '
# 'd_c0="ABDil0kyuA6PTjVQ7IwneotUmWxXWnJH-rU=|1545659382"; '
# 'capsion_ticket="2|1:0|10:1545659451|14:capsion_ticket|44:ZGZmZTBhNDBmYzJiNDY3ODgxYmUxMjViOWVmOTBiOTM=|08667171cafa1b1e4dc87a108af9cb4fafcdb1a5142070981ee56def83292c45"; '
# 'z_c0="2|1:0|10:1545659477|4:z_c0|92:Mi4xZW1vbkRRQUFBQUFBSUNLQ1NUSzREaVlBQUFCZ0FsVk5WVElPWFFBSm1EN1JRUHZlUzFiYTdwcWJIY3haOXZxQjFR|ec049be6c582d5a0d25bf8c6df84b9182a25f15df13b47bec1bf2b0850e9fca1"; '
# 'tst=r; q_c1=d9dbf3826c314a8cb47c0ea608032745|1545659479000|1545659479000;'
# ' __utma=51854390.459793879.1545659575.1545659575.1545659575.1;'
# ' __utmc=51854390; __utmz=51854390.1545659575.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; '
# '__utmv=51854390.100--|2=registration_date=20181113=1^3=entry_date=20181113=1; '
# 'tgw_l7_route=ec452307db92a7f0fdb158e41da8e5d8',
# 'Host':'https://www.zhihu.com/',
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
# }
# r = requests.get('https://www.zhihu.com',headers = headers)
#
# print(r.text)
# import requests
#
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
# }
# r = requests.get("https://www.zhihu.com/explore", headers=headers)
# print(r.text)
# import requests
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
# }
# files = {'file': open('favicon.ico', 'rb')}
# r = requests.post("http://httpbin.org/post", files=files,headers=headers)
# print(r.text)
# import requests
#
# r = requests.get("http://tool.oschina.net/commons?type=5")
# print(r.cookies)
# for key, value in r.cookies.items():
# print(key + '=' + value)
# import requests
#
# cookies = 'q_c1=31653b264a074fc9a57816d1ea93ed8b|1474273938000|1474273938000; d_c0="AGDAs254kAqPTr6NW1U3XTLFzKhMPQ6H_nc=|1474273938"; __utmv=51854390.100-1|2=registration_date=20130902=1^3=entry_date=20130902=1;a_t="2.0AACAfbwdAAAXAAAAso0QWAAAgH28HQAAAGDAs254kAoXAAAAYQJVTQ4FCVgA360us8BAklzLYNEHUd6kmHtRQX5a6hiZxKCynnycerLQ3gIkoJLOCQ==";z_c0=Mi4wQUFDQWZid2RBQUFBWU1DemJuaVFDaGNBQUFCaEFsVk5EZ1VKV0FEZnJTNnp3RUNTWE10ZzBRZFIzcVNZZTFGQmZn|1474887858|64b4d4234a21de774c42c837fe0b672fdb5763b0'
# jar = requests.cookies.RequestsCookieJar()
# headers = {
# 'Host': 'www.zhihu.com',
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
# }
# for cookie in cookies.split(';'):
# key, value = cookie.split('=',1)
# print(key,value)
# jar.set(key, value)
# txt = "Google#Runoob#Taobao#Facebook"
#
# # The second argument is maxsplit; the -1 used below means "no limit", so all four parts are returned
# x = txt.split("#", -1)
# print(x)
# import requests
#
# response = requests.get('https://www.12306.cn',verify=False)
# print(response.status_code)
# import requests
# import json
#
# data = {'some': 'data'}
# headers = {'content-type': 'application/json',
# 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
#
# r = requests.post('https://api.github.com/some/endpoint', data=data, headers=headers)
# print(r.text)
# import re
#
# content = 'Hello 1234567 World_This is a Regex Demo'
# result = re.match('^Hello\s(\d+)\sWorld', content)
# print(result)
# print(result.group())
# print(result.group(1))
# print(result.span())
# import requests
#
# files = {'file': open('favicon.ico', 'rb')}
# r = requests.post("http://httpbin.org/post", files=files)
# print(r.text)
# import requests
#
# requests.get('http://httpbin.org/cookies/set/number/123456789')
# r = requests.get('http://httpbin.org/cookies')
# print(r.text)
#
# import requests
#
# s = requests.Session()
# s.get('http://httpbin.org/cookies/set/number/123456789')
# r = s.get('http://httpbin.org/cookies')
# print(r.text)
# from requests import Request, Session
#
# url = 'http://httpbin.org/post'
#
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
# }
# s = Session()
# req = Request('POST', url, data=data, headers=headers)
# prepped = s.prepare_request(req)
# r = s.send(prepped)
# print(r.text)
# import requests
#
# headers = {
# 'Cookie': '_ga=GA1.2.1564052695.1545472099; user_trace_token=20181222174726-96d1cc6e-05ce-11e9-88d7-525400f775ce; LGUID=20181222174726-96d1cfb9-05ce-11e9-88d7-525400f775ce; WEBTJ-ID=20181226204006-167ea877c111ea-04617311d8f99d-444a022e-1049088-167ea877c122b6; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545472099,1545828007; LGSID=20181226203906-3bf5387f-090b-11e9-ad84-5254005c3644; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xb7520b3800027cc5%26issp%3D1%26f%3D3%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3D%26tn%3D21002492_20_hao_pg%26ch%3D%26rsv_enter%3D1%26inputT%3D2539; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; _gid=GA1.2.562376411.1545828007; JSESSIONID=ABAAABAAADEAAFI42C8637613209C017DE124D1096A2AA6; X_HTTP_TOKEN=041acf056623476a59311944761218a1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167ea87c48db9-0394bf9a87f259-444a022e-1049088-167ea87c48e99%22%2C%22%24device_id%22%3A%22167ea87c48db9-0394bf9a87f259-444a022e-1049088-167ea87c48e99%22%7D; sajssdk_2015_cross_new_user=1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=56; index_location_city=%E5%8C%97%E4%BA%AC; LG_LOGIN_USER_ID=273eed1f8806df1709e52878a2b4b7726fbd8bb12f906362; _putrc=3C0FA0629129C357; login=true; unick=%E6%A2%81%E6%96%B0%E6%96%8C; gate_login_token=f9a45f55f7a8ab49d7478ca818e4a2f17e3cf28c3bb52c73; TG-TRACK-CODE=index_navigation; SEARCH_ID=dab4f23d2f81447da7dad556ffab7503; LGRID=20181226204356-e9007c2f-090b-11e9-ad84-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545828298',
# 'Host': 'www.lagou.com',
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
# }
# r = requests.get('https://www.lagou.com/zhaopin/Python/?labelWords=label', headers = headers )
# print(r.text)
# from lxml import etree
# # text = '''
# # <div>
# # <ul>
# # <li class="item-0"><a href="link1.html">first item</a></li>
# # <li class="item-1"><a href="link2.html">second item</a></li>
# # <li class="item-inactive"><a href="link3.html">third item</a></li>
# # <li class="item-1"><a href="link4.html">fourth item</a></li>
# # <li class="item-0"><a href="link5.html">fifth item</a>
# # </ul>
# # </div>
# # '''
# # html = etree.HTML(text)
# # result = etree.tostring(html)
# # html1 = result.decode('utf-8')
# # print(html1)
# # # html2 = etree.parse(html1, etree.HTMLParser())
# # # print(html2)
# # # result1 = html2.xpath('//*')
# # # print(result1)
# from lxml import etree
# html = etree.parse('./test.html', etree.HTMLParser())
# result = html.xpath('/li')
# print(type(result))
# print(result)
# from lxml import etree
#
# html = etree.parse('./test.html', etree.HTMLParser())
# result = html.xpath('//a[@href="link4.html"]/../@class')
# print(result)
# from lxml import etree
#
# html = etree.parse('./test.html', etree.HTMLParser())
# result = html.xpath('//li/a/@href')
# print(result)
# from lxml import etree
# text = '''
# <li class="li li-first" name="item"><a href="link.html">first item</a></li>
# '''
# html = etree.HTML(text)
# result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
# print(result)
# from lxml import etree
#
# text = '''
# <div>
# <ul>
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-inactive"><a href="link3.html">third item</a></li>
# <li class="item-1"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a>
# </ul>
# </div>
# '''
# html = etree.HTML(text)
# result = html.xpath('//li[2]/a/text()')
# print(result)
# result = html.xpath('//li[last()]/a/text()')
# print(result)
# result = html.xpath('//li[position()<3]/a/text()')
# print(result)
# result = html.xpath('//li[last()-2]/a/text()')
# print(result)
# from lxml import etree
# html = etree.parse('./lagou.html', etree.HTMLParser())
# #获取公司名称
# company_name = html.xpath('//li/@data-company')
# # Get the position (job title) names
# position_name = html.xpath('//li/@data-positionname')
# #获取薪水数据
# salary = html.xpath('//li/@data-salary')
# tup = list(zip(company_name,position_name,salary))
# print(tup)
# from lxml import etree
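# # For example, to select the li nodes whose class is item-0, do it like this.
# # Adding [@class="item-0"] restricts the match to nodes whose class attribute is item-0;
# # two li nodes in the HTML text satisfy this, so the result should contain two matched elements.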
# html = etree.parse('./test.html', etree.HTMLParser())
# result = html.xpath('//li[@class="item-0"]')
# print(result)
# from lxml import etree
#
# html = etree.parse('./test.html',etree.HTMLParser())
# #获取到的是换行符,因为 / 是选取直接子节点,而li的直接子节点为a节点,
# # 而文本都在a节点中,所以匹配到的是li内部的换行符了
# result = html.xpath('//li[@class="item-0"]/text()')
# #如果想要获取子孙节点内部的所有文本,可以直接使用 // 加 text()的方式,这样可以保证获取到最全面的数据
# #但有时候可能会夹杂一些换行符等数据需要处理
# #也可以按照逐层查找的方式获取文本数据
# #按照逐层查找的方式获取文本数据
# result1 = html.xpath('//li[@class="item-0"]/a/text()')
# #获取li[@class="item-0"] 节点下的所有文本数据
# result2= html.xpath('//li[@class="item-0"]//text()')
# print(result1)
# print(result2)
# #我们通过@href即可获取节点的href属性。注意,此处和属性匹配的方法不同,
# #属性匹配是在中括号中加属性名来限定某个属性,如[@href="link1.html"],
# #而这里的@href指的是获取节点的某个属性,返回结构为列表形式。
# from lxml import etree
# #获取li节点下所有a节点的href属性
# html = etree.parse('./test.html',etree.HTMLParser())
# result = html.xpath('//li/a/@href')
# print(result)
# #有时候,某些节点的某个属性可能有多个值
# from lxml import etree
# text = '''
# <li class="li li-first"><a href="link.html">first item</a></li>
# '''
# html = etree.HTML(text)
# result = html.xpath('//li[@class="li"]/a/text()')
# print(result)
#
# #这里HTML文本中li节点的class属性有两个值li和li-first,此时如果还想用之前的属性匹配获取,就无法匹配了,
# #运行结果为空
# #这时就需要用contains()函数了
# from lxml import etree
# text = '''
# <li class="li li-first"><a href="link.html">first item</a></li>
# '''
# html = etree.HTML(text)
# result = html.xpath('//li[contains(@class, "li")]/a/text()')
# print(result)
#
# #这样通过contains()方法,第一个参数传入属性名称,第二个参数传入属性值,只要此属性包含所传入的属性值,就可以完成匹配了
# # This approach is often used when an attribute of a node has several values, e.g. a node's class attribute often holds more than one value.
#另外,我们可能还遇到一种情况,那就是根据多个属性确定一个节点,这时就需要同时匹配多个属性。
# 此时可以使用运算符and来连接,示例如下:
# from lxml import etree
# text = '''
# <li class="li li-first" name="item"><a href="link.html">first item</a></li>
# '''
# html = etree.HTML(text)
# result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
# print(result)
# #这里的li节点又增加了一个属性name。要确定这个节点,需要同时根据class和name属性来选择,
# # 一个条件是class属性里面包含li字符串,另一个条件是name属性为item字符串,二者需要同时满足,
# # 需要用and操作符相连,相连之后置于中括号内进行条件筛选
# #这里的and其实是XPath中的运算符。另外,还有很多运算符,如or、mod等
# http://www.w3school.com.cn/xpath/xpath_operators.asp。
# #有时候,我们在选择的时候某些属性可能同时匹配了多个节点,但是只想要其中的某个节点,
# # 如第二个节点或者最后一个节点,
#
# #这时可以利用中括号传入索引的方法获取特定次序的节点,例如:
# from lxml import etree
#
# text = '''
# <div>
# <ul>
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-inactive"><a href="link3.html">third item</a></li>
# <li class="item-1"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a>
# </ul>
# </div>
# '''
# html = etree.HTML(text)
# result = html.xpath('//li[1]/a/text()')
# print(result)
# result = html.xpath('//li[last()]/a/text()')
# print(result)
# result = html.xpath('//li[position()<3]/a/text()')
# print(result)
# result = html.xpath('//li[last()-2]/a/text()')
# print(result)
#
# #第一次选择时,我们选取了第一个li节点,中括号中传入数字1即可。注意,这里和代码中不同,序号是以1开头的,不是以0开头。
#
# #第二次选择时,我们选取了最后一个li节点,中括号中传入last()即可,返回的便是最后一个li节点。
#
# #第三次选择时,我们选取了位置小于3的li节点,也就是位置序号为1和2的节点,得到的结果就是前两个li节点。
#
# #第四次选择时,我们选取了倒数第三个li节点,中括号中传入last()-2即可。因为last()是最后一个,所以last()-2就是倒数第三个
# #这里我们使用了last()、position()等函数。在XPath中,提供了100多个函数,包括存取、数值、字符串、逻辑、节点、序列等处理功能,
# #它们的具体作用可以参考:http://www.w3school.com.cn/xpath/xpath_functions.asp
#XPath提供了很多节点轴选择方法,包括获取子元素、兄弟元素、父元素、祖先元素等
# from lxml import etree
#
# text = '''
# <div>
# <ul>
# <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-inactive"><a href="link3.html">third item</a></li>
# <li class="item-1"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a>
# </ul>
# </div>
# '''
# html = etree.HTML(text)
# result = html.xpath('//li[1]/ancestor::*')
# print(result)
# result = html.xpath('//li[1]/ancestor::div')
# print(result)
# result = html.xpath('//li[1]/attribute::*')
# print(result)
# result = html.xpath('//li[1]/child::a[@href="link1.html"]')
# print(result)
# result = html.xpath('//li[1]/descendant::span')
# print(result)
# result = html.xpath('//li[1]/following::*[2]')
# print(result)
# result = html.xpath('//li[1]/following-sibling::*')
#
# #第一次选择时,我们调用了ancestor轴,可以获取所有祖先节点。其后需要跟两个冒号,然后是节点的选择器,这里我们直接使用*,表示匹配所有节点,因此返回结果是第一个li节点的所有祖先节点,包括html、body、div和ul。
# #第二次选择时,我们又加了限定条件,这次在冒号后面加了div,这样得到的结果就只有div这个祖先节点了。
# #第三次选择时,我们调用了attribute轴,可以获取所有属性值,其后跟的选择器还是*,这代表获取节点的所有属性,返回值就是li节点的所有属性值。
# #第四次选择时,我们调用了child轴,可以获取所有直接子节点。这里我们又加了限定条件,选取href属性为link1.html的a节点。
# #第五次选择时,我们调用了descendant轴,可以获取所有子孙节点。这里我们又加了限定条件获取span节点,所以返回的结果只包含span节点而不包含a节点。
# #第六次选择时,我们调用了following轴,可以获取当前节点之后的所有节点。这里我们虽然使用的是*匹配,
# # 但又加了索引选择,所以只获取了第二个后续节点。
# #第七次选择时,我们调用了following-sibling轴,可以获取当前节点之后的所有同级节点。
# # 这里我们使用*匹配,所以获取了所有后续同级节点。
# #以上是XPath轴的简单用法,更多轴的用法可以参考:http://www.w3school.com.cn/xpath/xpath_axes.asp
# html = """
# <html>
# <head>
# <title>The Dormouse's story</title>
# </head>
# <body>
# <p class="story">
# Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">
# <span>Elsie</span>
# </a>
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
# and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
# and they lived at the bottom of a well.
# </p>
# <p class="story">...</p>
# """
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html, 'lxml')
# print(soup.p.descendants)
# for i, child in enumerate(soup.p.descendants):
# print(i, child)
# html = """
# <html>
# <head>
# <title>The Dormouse's story</title>
# </head>
# <body>
# <p class="story">
# Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">
# <span>Elsie</span>
# </a>
# </p>
# <p class="story">...</p>
# """
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html, 'lxml')
# print(soup.span.parents)
#
# print(list(enumerate(soup.span.parents)))
# html = """
# <html>
# <body>
# <p class="story">
# Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">
# <span>Elsie</span>
# </a>
# Hello
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
# and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
# and they lived at the bottom of a well.
# </p>
# """
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html, 'lxml')
# print('Next Sibling', soup.a.next_sibling)
# print('Prev Sibling', soup.a.previous_sibling)
# print('Next Siblings', list(enumerate(soup.a.next_siblings)))
# print('Prev Siblings', list(enumerate(soup.a.previous_siblings)))
#
# print('Next Siblings', list(enumerate(soup.a.next_siblings)))
# print('Next Siblings', list(soup.a.next_siblings))
# html = """
# <html>
# <body>
# <p class="story">
# Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">Bob</a>
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
# </p>
# """
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html, 'lxml')
# print('Next Sibling:')
# print(type(soup.a.next_sibling))
# print(soup.a.next_sibling)
# print(soup.a.next_sibling.string)
# print('Parent:')
# print(type(soup.a.parents))
# print(list(soup.a.parents)[0])
# print(list(soup.a.parents)[0].attrs['class'])
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html, 'lxml')
# # print(soup.find_all(name='ul'))
# # print(type(soup.find_all(name='ul')[0]))
#
# for ul in soup.find_all(name='ul'):
# print(ul.find_all(name='li'))
# for li in ul.find_all(name='li'):
# print(li.string)
# html='''
# <div class="panel">
# <div class="panel-heading">
# <h4>Hello</h4>
# </div>
# <div class="panel-body">
# <ul class="list" id="list-1">
# <li class="element">Foo</li>
# <li class="element">Bar</li>
# <li class="element">Jay</li>
# </ul>
# <ul class="list list-small" id="list-2">
# <li class="element">Foo</li>
# <li class="element">Bar</li>
# </ul>
# </div>
# </div>
# '''
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html, 'lxml')
# print(soup.find_all(id='list-1'))
# print(soup.find_all(class_='element'))
#
# import re
# html='''
# <div class="panel">
# <div class="panel-body">
# <a>Hello, this is a link</a>
# <a>Hello, this is a link, too</a>
# </div>
# </div>
# '''
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html, 'lxml')
# print(soup.find(text=re.compile('link')))
# html='''
# <div class="panel">
# <div class="panel-heading">
# <h4>Hello</h4>
# </div>
# <div class="panel-body">
# <ul class="list" id="list-1">
# <li class="element">Foo</li>
# <li class="element">Bar</li>
# <li class="element">Jay</li>
# </ul>
# <ul class="list list-small" id="list-2">
# <li class="element">Foo</li>
# <li class="element">Bar</li>
# </ul>
# </div>
# </div>
# '''
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html, 'lxml')
# for li in soup.select('li'):
# print('Get Text:', li.get_text())
# print('String:', li.string)
# html = '''
# <div id="container">
# <ul class="list">
# <li class="item-0">first item</li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# '''
#
# from pyquery import PyQuery as pq
# doc = pq(html)
# items = doc('.list')
# print(type(items))
# print(items)
# lis = items.find('li')
# print(type(lis))
# print(lis)
# html = '''
# <div class="wrap">
# <div id="container">
# <ul class="list">
# <li class="item-0">first item</li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# </div>
# '''
# from pyquery import PyQuery as pq
# doc = pq(html)
# li = doc('li:first-child')
# print(li)
# li = doc('li:last-child')
# print(li)
# li = doc('li:nth-child(2)')
# print(li)
# li = doc('li:gt(2)')
# print(li)
# li = doc('li:nth-child(2n)')
# print(li)
# import json
# numbers = [2,3,5,7,9,12,13]
# filename = 'numbers.json'
# with open(filename,'w') as f_obj:
# json.dump(numbers,f_obj)
#
# import json
# with open('numbers.json') as f_obj:
# contents = json.load(f_obj)
# print(contents)
# import json
# #
# # data = [{
# # 'name': '王伟',
# # 'gender': '男',
# # 'birthday': '1992-10-18'
# # },
# # {
# # 'name': '梁新斌',
# # 'gender': '男',
# # 'birthday': '1992-03-24'
# # }
# # ]
# # with open('data.json', 'w',encoding='utf-8') as file:
# # #利用dumps()方法,我们可以将JSON对象转为字符串,然后再调用文件的write()方法写入文本
# # #file.write(json.dumps(data))
# # #如果想保存JSON的格式,可以再加一个参数indent,代表缩进字符个数
# # #file.write(json.dumps(data,indent=4))
# # #如果JSON中包含中文字符,为了输出中文,需要指定参数ensure_ascii为False,另外还要规定文件输出的编码
# # file.write(json.dumps(data, indent=4,ensure_ascii=False))
# import csv
# #首先,打开data.csv文件,然后指定打开的模式为w(即写入),获得文件句柄,随后调用csv库的writer()方法初始化写入对象,
# # 传入该句柄,然后调用writerow()方法传入每行的数据即可完成写入
# #运行结束后,直接用txt即可打开。
# #可以看到,写入的文本默认以逗号分隔,调用一次writerow()方法即可写入一行数据,文件也可以使用excel打开。
# #如果想修改列与列之间的分隔符,可以传入delimiter参数
# with open('data.csv', 'w') as csvfile:
# writer = csv.writer(csvfile, delimiter='|')
# #writer = csv.writer(csvfile)
# writer.writerow(['id', 'name', 'age'])
# writer.writerow(['10001', 'Mike', 20])
# writer.writerow(['10002', 'Bob', 22])
# writer.writerow(['10003', 'Jordan', 21])
# #我们也可以调用writerows()方法同时写入多行,此时参数就需要为二维列表
# with open('data.csv', 'w') as csvfile:
# writer = csv.writer(csvfile, delimiter='^')
# writer.writerow(['id', 'name', 'age'])
# writer.writerows([['10001', 'Mike', 20], ['10002', 'Bob', 22], ['10003', 'Jordan', 21]])
# #但是一般情况下,爬虫爬取的都是结构化数据,我们一般会用字典来表示。在csv库中也提供了字典的写入方式
# with open('data.csv', 'w',encoding='utf-8') as csvfile:
# #这里先定义3个字段,用fieldnames表示,然后将其传给DictWriter来初始化一个字典写入对象,
# # 接着可以调用writeheader()方法先写入头信息,然后再调用writerow()方法传入相应字典即可
# fieldnames = ['id', 'name', 'age']
# writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
# writer.writeheader()
# writer.writerow({'id': '10001', 'name': '梁新斌', 'age': 20})
# writer.writerow({'id': '10002', 'name': '杨丽颖', 'age': 22})
# writer.writerow({'id': '10003', 'name': 'Jordan', 'age': 21})
# import pymysql
# import tool
# db_conn = tool.get_connect()
# cursor = tool.get_cursor(db_conn)
# data = {
# 'id': '20120001',
# 'name': 'Bob',
# 'age': 20
# }
# table = 'students'
# keys = ', '.join(data.keys())
# print(data.keys())
# print(keys)
# values = ', '.join(['%s'] * len(data))
# print(values)
# sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
# print(sql)
# print(tuple(data.values()))
# try:
# if cursor.execute(sql, tuple(data.values())):
# print('Successful')
# db_conn.commit()
# except :
# print('Failed')
# db_conn.rollback()
# db_conn.close()
# import pymysql
# import tool
# db = tool.get_connect()
# cursor = tool.get_cursor(db)
# table = 'students'
# condition = 'age > 24'
#
# sql = 'DELETE FROM {table} WHERE {condition}'.format(table=table, condition=condition)
# try:
# cursor.execute(sql)
# db.commit()
# except:
# db.rollback()
#
# db.close()
# seq1 = ['hello','good','boy','doiido']
# print ('|'.join(seq1))
# seq4 = {'hello':1,'good':2,'boy':3,'doiido':4}
# print ('|'.join(seq4))
#
# a = ['日期', '天气状况', '气温', '风力风向', '2011年01月01日', '晴/晴', '0℃/-9℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月02日', '多云/阴', '-2℃/-7℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月03日', '晴/晴', '1℃/-8℃', '北风3-4级/无持续风向≤3级', '2011年01月04日', '晴/晴', '-1℃/-11℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月05日', '晴/晴', '-1℃/-8℃', '北风4-5级/北风3-4级', '2011年01月06日', '晴/晴', '0℃/-10℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月07日', '晴/多云', '1℃/-7℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月08日', '多云/晴', '1℃/-8℃', '北风4-5级/北风4-5级', '2011年01月09日', '晴/晴', '-1℃/-10℃', '北风3-4级/无持续风向≤3级', '2011年01月10日', '晴/多云', '-1℃/-7℃', '无持续风向≤3级/北风3-4级', '2011年01月11日', '晴/晴', '-1℃/-11℃', '北风3-4级/无持续风向≤3级', '2011年01月12日', '多云/多云', '0℃/-8℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月13日', '晴/晴', '1℃/-7℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月14日', '晴/晴', '-1℃/-10℃', '北风4-5级/北风3-4级', '2011年01月15日', '晴/阴', '3℃/-4℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月16日', '晴/晴', '-1℃/-9℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月17日', '晴/晴', '-1℃/-9℃', '北风3-4级/无持续风向≤3级', '2011年01月18日', '晴/晴', '-1℃/-10℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月19日', '晴/晴', '0℃/-9℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月20日', '晴/多云', '0℃/-7℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月21日', '晴/多云', '1℃/-7℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月22日', '多云/多云', '0℃/-9℃', '无持续风向≤3级/北风3-4级', '2011年01月23日', '晴/晴', '-1℃/-10℃', '北风3-4级/无持续风向≤3级', '2011年01月24日', '晴/多云', '0℃/-9℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月25日', '晴/晴', '0℃/-10℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月26日', '晴/多云', '0℃/-8℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月27日', '晴/晴', '2℃/-7℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月28日', '晴/晴', '4℃/-7℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月29日', '晴/晴', '-1℃/-10℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月30日', '晴/晴', '3℃/-7℃', '无持续风向≤3级/无持续风向≤3级', '2011年01月31日', '多云/雾', '4℃/-2℃', '无持续风向≤3级/无持续风向≤3级']
#
# n = 4
# c= [a[i:i+n] for i in range(0, len(a), n)]
# print(c)
# a= ['2011年12月01日', '晴/阴', '4℃/-2℃', '无持续风向≤3级/无持续风向≤3级']
# a2 = a[2].split('/')
# a.remove(a[2])
# a.insert(2,a2[0])
# a.insert(3,a2[1])
# print(a2)
# print(a)
# import pymysql
#
#
# dbconn = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='test')
# db_cur = dbconn.cursor()
# sql = "select GROUP_CONCAT(COLUMN_name,'') from information_schema.COLUMNS where table_name = %s ORDER BY ordinal_position "
# db_cur.execute(sql,'weather')
# tup = db_cur.fetchone()
# print(tup[0])
import pymysql
import tool
from lxml import etree
import requests
def parse_html(db_conn, db_cur, url):
    """Scrape the daily weather rows from one monthly history page.

    The page at ``url`` is downloaded, the text of every table cell is
    collected, and the cells are grouped four at a time (date, weather,
    temperature, wind).  The first group is the table header and is dropped.
    In every remaining row the ``high/low`` temperature cell is split into
    two columns and the city id returned by ``get_cityid`` is put in front.

    Args:
        db_conn: Database connection.  It is closed before this function
            returns, exactly as the original implementation did.
        db_cur: Cursor used by ``get_cityid`` for the city lookup.
        url: History page URL, e.g.
            ``http://www.tianqihoubao.com/lishi/beijing/month/201112.html``.

    Returns:
        A list of rows shaped ``[city_id, date, weather, high, low, wind]``.
        (Earlier versions built this list but returned nothing.)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    columns = 4

    # Resolve the city id from the city name embedded in the URL.
    city_id = get_cityid(db_conn, db_cur, url)

    # Download the page and pull the text node of every table cell.
    page_source = requests.get(url=url, headers=headers).text
    cell_texts = etree.HTML(page_source).xpath('//table//tr//td//text()')

    # Drop whitespace-only nodes and strip spaces / CRLF inside the others.
    cells = [
        text.replace(' ', '').replace('\r\n', '').strip()
        for text in cell_texts
        if text.strip() != ''
    ]

    # One table row per chunk of four cells; the first chunk is the header.
    rows = [cells[i:i + columns] for i in range(0, len(cells), columns)][1:]

    result = []
    for row in rows:
        if len(row) < columns:
            # A trailing partial chunk has no temperature cell to split.
            continue
        # Split by position, not by value: list.remove(row[2]) would delete
        # the wrong cell if an earlier cell happened to hold the same text.
        temperatures = row[2].split('/')
        high = temperatures[0]
        low = temperatures[1] if len(temperatures) > 1 else ''
        result.append([city_id, row[0], row[1], high, low, *row[3:]])

    db_conn.close()
    return result
def get_cityid(db_conn, db_cur, url):
    """Return the id of the city named in a weather-history URL.

    The city name is taken from the fifth ``/``-separated piece of ``url``
    (index 4) and looked up in the ``city`` table through ``db_cur``.

    Args:
        db_conn: Unused here; kept so the signature matches the other helpers.
        db_cur: Cursor offering ``execute(sql, arg)`` and ``fetchone()``.
        url: Page URL whose fifth path piece is the city name.

    Returns:
        The first column of the single row fetched from the cursor.
    """
    city_name = url.split('/')[4]
    query = 'select cityid from city where cityname = %s '
    db_cur.execute(query, city_name)
    row = db_cur.fetchone()
    return list(row)[0]
def parse_html_bs_wy(db_conn, db_cur, url):
    """Print job-posting fields found on a page, using BeautifulSoup.

    The page at ``url`` is downloaded and parsed with the ``lxml`` parser.
    For each of the CSS classes ``company``, ``name``, ``job_request`` and
    ``job-advantage`` the matching elements are located and their text is
    printed, in that order.

    Args:
        db_conn: Unused; kept for signature parity with the other parsers.
        db_cur: Unused; kept for signature parity with the other parsers.
        url: Address of the page to fetch.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Connection': 'close'
    }
    # Fetch the HTML source of the page.
    page_source = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(page_source, 'lxml')

    # NOTE: attrs must be a dict.  The previous {'class', 'company'} was a
    # *set* literal, which BeautifulSoup treats as "class is any of these
    # values" and therefore also matched elements with class="class".
    for css_class in ('company', 'name'):
        for node in soup.find_all(attrs={'class': css_class}):
            print(node.string)

    for request in soup.find_all(attrs={'class': 'job_request'}):
        for span in request.find_all(name='span'):
            print(span.string)
        for item in request.find_all(name='li'):
            print(item.string)

    for advantage_block in soup.find_all(attrs={'class': 'job-advantage'}):
        holders = (
            advantage_block.find(attrs={'class': 'advantage'}),
            advantage_block.find(name='p'),
        )
        for holder in holders:
            if holder is None:
                # find() returns None when nothing matches; iterating it
                # would raise TypeError.
                continue
            # Iterating a tag walks its direct children.
            for child in holder:
                print(child.string)
def parse_html_py(db_conn, db_cur, url):
    """Print the table-cell text of a page, parsed with PyQuery.

    Downloads ``url``, selects ``#content.wdetail``, gathers the text of all
    ``td`` elements below it and prints that text split on single spaces.

    Args:
        db_conn: Unused; kept for signature parity with the other parsers.
        db_cur: Unused; kept for signature parity with the other parsers.
        url: Address of the page to fetch.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Connection': 'close',
    }
    # Fetch the HTML source of the page.
    response = requests.get(url=url, headers=request_headers)
    cells = pq(response.text)('#content.wdetail').find('td')
    print(cells.text().strip().split(' '))
def get_url():
    """Return the Lagou Python job-listing URLs for pages 2 through 30."""
    prefix = 'https://www.lagou.com/zhaopin/Python/'
    suffix = '/?filterOption=3'
    return [f'{prefix}{page}{suffix}' for page in range(2, 31)]
def get_div():
    """Print every ``<div class="info_list">`` found in an embedded HTML sample.

    The sample markup is parsed with the ``html.parser`` backend; each
    matching ``div`` is printed as-is.
    """
    # For a live page the markup could be read like this instead:
    # html_doc = "http://tieba.baidu.com/p/2460150866"
    # req = urllib.request.Request(html_doc)
    # webpage = urllib.request.urlopen(req)
    # html = webpage.read()
    markup = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="xiaodeng"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<a href="http://example.com/lacie" class="sister" id="xiaodeng">Lacie</a>
and they lived at the bottom of a well.</p>
<div class="ntopbar_loading"><img src="http://simg.sinajs.cn/blog7style/images/common/loading.gif">加载中…</div>
<div class="SG_connHead">
<span class="title" comp_title="个人资料">个人资料</span>
<span class="edit">
</span>
<div class="info_list">
<ul class="info_list1">
<li><span class="SG_txtc">博客等级:</span><span id="comp_901_grade"><img src="http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif" real_src="http://simg.sinajs.cn/blog7style/images/common/number/9.gif" /></span></li>
<li><span class="SG_txtc">博客积分:</span><span id="comp_901_score"><strong>0</strong></span></li>
</ul>
<ul class="info_list2">
<li><span class="SG_txtc">博客访问:</span><span id="comp_901_pv"><strong>3,971</strong></span></li>
<li><span class="SG_txtc">关注人气:</span><span id="comp_901_attention"><strong>0</strong></span></li>
<li><span class="SG_txtc">获赠金笔:</span><strong id="comp_901_d_goldpen">0支</strong></li>
<li><span class="SG_txtc">赠出金笔:</span><strong id="comp_901_r_goldpen">0支</strong></li>
<li class="lisp" id="comp_901_badge"><span class="SG_txtc">荣誉徽章:</span></li>
</ul>
</div>
<div class="atcTit_more"><span class="SG_more"><a href="http://blog.sina.com.cn/" target="_blank">更多>></a></span></div>
<p class="story">...</p>
"""
    document = BeautifulSoup(markup, 'html.parser')  # parsed document object
    # Select divs by class name (a text filter such as string='更多' could be added).
    matching_divs = document.find_all('div', class_='info_list')
    for div in matching_divs:
        print(div)
# <div class="atcTit_more"><span class="SG_more"><a href="http://blog.sina.com.cn/" target="_blank">更多>></a></span></div>
def get_movie(url):
    """Print the paragraph texts of each movie entry on a board page.

    Downloads ``url``, finds every element with class ``board-item-main``,
    collects the stripped text of each ``<p>`` inside it that has a single
    string, and prints those texts grouped three at a time.

    Args:
        url: Address of the Maoyan board page to fetch.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Connection': 'close'
    }
    # Fetch the HTML source of the Maoyan movie page.
    page_source = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(page_source, 'lxml')

    # attrs must be a dict; the previous {'class', 'board-item-main'} was a
    # set literal and also matched elements whose class was literally "class".
    entries = soup.find_all(attrs={'class': 'board-item-main'})

    # Tag.string is None when a <p> has no single text child; skip those.
    texts = [
        paragraph.string.strip()
        for entry in entries
        for paragraph in entry.find_all(name='p')
        if paragraph.string is not None
    ]

    group_size = 3
    groups = [texts[i:i + group_size] for i in range(0, len(texts), group_size)]
    print(groups)
def others():
    """Print the multiples of 20 from 20 through 400 as a list."""
    offsets = list(range(20, 401, 20))
    print(offsets)
def get_page(page):
    """Build and print the Toutiao search URL for one result offset.

    Only the URL is produced; the request itself is still disabled (see the
    commented-out code at the end).

    Args:
        page: Value sent as the ``offset`` query parameter.
    """
    query = [
        ('offset', page),
        ('format', 'json'),
        ('keyword', '街拍'),
        ('autoload', 'true'),
        ('count', '20'),
        ('cur_tab', '1'),
        ('from', 'search_tab'),
        ('pd', 'synthesis'),
    ]
    # Only referenced by the disabled request code below.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Connection': 'close'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(query)
    print(url)
    # response = requests.get(url = url,headers = headers)
    # try:
    #     response = requests.get(url)
    #     if response.status_code == 200:
    #         return response.json()
    # except requests.ConnectionError:
    #     return None
def pict_to_str():
    """Extract the ``ROOM_PRICE`` JSON object from an embedded script sample.

    Returns:
        The decoded dict, holding the sprite URL under ``'image'`` and the
        list of per-price digit offsets under ``'offset'``.
    """
    script = '''
<script>
var offset_unit = 30;
if(window.devicePixelRatio >1){
$('body').addClass('ratio2');
}
var ROOM_PRICE = {"image":"//static8.ziroom.com/phoenix/pc/images/price/2e5de016645b8deee948bceb5932e4c5s.png","offset":[[5,7,1,4],[8,9,6,4],[8,1,1,4],[8,7,5,4],[5,0,1,4],[8,4,5,4],[8,3,6,4],[5,1,1,4],[5,1,1,4],[8,4,6,4],[5,8,6,4],[5,9,6,4],[5,6,5,4],[8,3,6,4],[8,5,5,4],[5,9,5,4],[5,6,6,4],[5,2,4,4]]};
$('#houseList p.price').each(function(i){
var dom = $(this);
if(!ROOM_PRICE['offset'] || !ROOM_PRICE['offset'][i]) return ;
var pos = ROOM_PRICE['offset'][i];
for(i in pos){
var inx = pos.length -i -1;
var seg = $('<span>', {'style':'background-position:-'+(pos[inx]*offset_unit)+'px', 'class':'num'});
dom.prepend(seg);
}
var seg = $('<span>', {'style':'background-position:1000px', 'class':'num rmb'}).html('¥');
dom.prepend(seg);
});
</script>
'''
    # The object literal runs from its opening brace to the closing "]]}".
    found = re.search('{"image":"//.*]]}', script, re.S)
    return json.loads(found.group(0))
def parse_price(offset):
    """Print the digit string selected by a sequence of indexes.

    Each index in ``offset`` picks one character from the fixed digit table
    ``'8652039147'``; the picked characters are joined in order and printed.

    Args:
        offset: Iterable of integer indexes into the digit table.
    """
    digit_table = '8652039147'
    price = ''.join(digit_table[index] for index in offset)
    print(price)
def test():
    """Show what ``find_all(name='ul')`` returns for a small HTML sample.

    Prints the type of the result, then the result itself.
    """
    from bs4 import BeautifulSoup

    sample = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    document = BeautifulSoup(sample, 'lxml')
    unordered_lists = document.find_all(name='ul')
    # The result type is <class 'bs4.element.ResultSet'>, a list-like
    # collection holding the matched tags.
    print(type(unordered_lists))
    print(unordered_lists)
# Turn the transparent background of a PNG white.
def background_turn_white(image_name):
    """Flatten an image onto a white background and save it as a new file.

    Args:
        image_name: Path of the image to read.

    Returns:
        Path of the written file: ``image_name`` with ``white`` prefixed to
        its file-name component (``acb.png`` -> ``whiteacb.png``,
        ``imgs/acb.png`` -> ``imgs/whiteacb.png``).
    """
    with Image.open(image_name) as source:
        # paste() uses the image as its own mask, which needs an alpha band;
        # converting first lets RGB / palette inputs through as well.
        foreground = source.convert('RGBA')
    width, height = foreground.size

    # Fill the canvas with opaque white, then composite the image over it.
    canvas = Image.new('RGBA', foreground.size, (255, 255, 255, 255))
    canvas.paste(foreground, (0, 0, width, height), foreground)

    # Prefix only the file name so a directory in image_name stays valid.
    directory, file_name = os.path.split(image_name)
    white_picture_name = os.path.join(directory, 'white' + file_name)
    canvas.save(white_picture_name)
    return white_picture_name
def pict_to_string(image_name):
    """Run OCR on an image file, print the recognised text and return it.

    Args:
        image_name: Path of the image to read.

    Returns:
        The text produced by ``pytesseract.image_to_string``.  (Earlier
        versions only printed it.)
    """
    # Context manager so the underlying file handle is always released.
    with Image.open(image_name) as image:
        # A language can be selected with lang='chi_sim' or lang='eng'.
        text = pytesseract.image_to_string(image)
    print(text)
    return text
if __name__ == '__main__':
    # Script entry point: OCR the image file 'acb.png' and print the text.
    image_name = 'acb.png'
    # Optional pre-processing step (disabled): flatten transparency first.
    # white_picture_name = background_turn_white(image_name)
    pict_to_string(image_name)
    # Alternative flow (disabled): decode each price from the offset table
    # embedded in the sample script instead of using OCR.
    # dictinfo = pict_to_str()
    # img_url = 'https://' + dictinfo['image']
    # offsets = dictinfo['offset']
    # for offset in offsets:
    #     parse_price(offset)
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。