# [Web page notice captured together with the source] Code pull complete; the page will refresh automatically.
import re
from io import BytesIO
from urllib.parse import urljoin

import pytesseract
import requests
from bs4 import BeautifulSoup
from PIL import Image
# Cookies sent with the listing-page request.
# NOTE(review): the session value is hard-coded; presumably it belongs to a
# single logged-in session and must be replaced once it stops working - confirm.
cookies = dict(
    session=".eJyrViotTi1SsqpWyiyOT0zJzcxTsjLQUcrJTwexSopKU3WUcvOTMnNSlayUDM3gQEkHrDE-M0XJyhjCzkvMBSmKKTVNMjMDkiamFkq1tQDfeR3n.YLOC4w.Xbnx1QbrvUh8OUPb5jauC_Aau9U",
)

# Request headers sent with the listing-page request; the User-Agent
# identifies the client as desktop Chrome on macOS.
headers = {
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_16_0) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.92 Safari/537.36"
    ),
    # Media types the client accepts, joined into one comma-separated value.
    "Accept": ",".join((
        "text/html",
        "application/xhtml+xml",
        "application/xml;q=0.9",
        "image/webp",
        "image/apng",
        "*/*;q=0.8",
        "application/signed-exchange;v=b3;q=0.9",
    )),
    # Preferred response languages, joined the same way.
    "Accept-Language": ",".join(("zh-CN", "zh;q=0.9", "en;q=0.8")),
}
# Base address of the target site; relative links found on the page are
# resolved against it.
root_url = 'http://47.103.13.124:8001/'
# Seconds to wait on each HTTP request, so that a stalled server cannot hang
# the script forever (requests applies no timeout by default).
request_timeout = 10

# Download the listing page. verify=False switches off TLS certificate
# verification; root_url is plain http, so it only matters if the address is
# ever changed to https.
url = urljoin(root_url, 'phone_picture')
response = requests.get(
    url,
    headers=headers,
    cookies=cookies,
    verify=False,
    timeout=request_timeout,
)
# Stop on an HTTP error status instead of trying to parse an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# The images of interest are the <img> tags inside the page's <tbody>.
tbody = soup.find('tbody')
if tbody is None:
    raise SystemExit(f'No <tbody> element found at {url}; nothing to OCR.')
imgs = tbody.find_all('img')
# Skip <img> tags without a src: urljoin() would silently turn a missing
# value into the site root, which is not an image.
urls = [img.attrs['src'] for img in imgs if img.attrs.get('src')]

# Leading run of digits in the OCR text; raw string so that \d is a regex
# escape rather than an invalid string escape. The pattern always matches
# (possibly the empty string), so .group() below cannot fail.
phone_pattern = re.compile(r'\d*')

for src in urls:
    # Build the full URL of the image.
    image_url = urljoin(root_url, src)
    image_response = requests.get(image_url, timeout=request_timeout)
    image_response.raise_for_status()
    # Decode from the fully downloaded body: .content is content-decoded and
    # the connection is released, unlike reading the raw streamed socket.
    img = Image.open(BytesIO(image_response.content))
    res = pytesseract.image_to_string(img)
    phone_number = phone_pattern.match(res).group()
    print(f'URL: {image_url}, OCR Result: {res}, PhoneNumber: {phone_number}')
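# [Web page notice captured together with the source] This location may contain content that is not suitable for display, so the page does not show it. You can review and revise it yourself using the relevant editing functions.
# If you confirm that the content does not involve inappropriate language / pure advertising or traffic diversion / violence / vulgar or pornographic material / infringement / piracy / false information / worthless content, or content that violates relevant national laws and regulations, you can click submit to file an appeal, and we will handle it for you as soon as possible.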