代码拉取完成,页面将自动刷新
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 18/4/15 上午2:07
# @Author : 重剑无锋
# @Email : TideSecPlus@gmail.com
import random,os,mysql.connector
import urllib2,re,requests
import time,hashlib,urllib
import sys
# Connection settings for the MySQL database that stores the file hashes.
# NOTE(review): the credentials are hard-coded in source; an alternative host
# (192.168.1.201) used to be kept next to 'host' as a commented-out entry.
mysql_config = dict(
    host='127.0.0.1',
    port=3306,
    db_name='finger',
    username='root',
    password='123456'
)
def url_protocol(url):
    """Return the scheme of *url*, i.e. the text before its first ``://``.

    When *url* contains no ``://`` at all it is returned unchanged, which is
    what callers of the previous implementation received as well.

    The split is done on the FIRST ``://`` so that a URL embedding another URL
    (for example in its query string) still yields its own scheme.
    """
    scheme, separator, _ = url.partition('://')
    if separator:
        return scheme
    return url
def get_domain(target):
    """Return the bare host name contained in *target*.

    The scheme, any ``user:password@`` prefix, the port, and everything from
    the first ``/``, ``?`` or ``#`` onwards are removed, and a leading
    ``www.`` label is dropped.  Only a real ``www.`` prefix is stripped, so
    hosts such as ``www2.example.com`` are returned intact.

    If *target* is not a string it is returned unchanged.
    """
    try:
        _, separator, remainder = target.partition('://')
    except (AttributeError, TypeError):
        # Not a text value; hand it back as-is, as the old code did.
        return target

    host = remainder if separator else target
    # Cut off path, query and fragment, whichever comes first.
    for delimiter in ('/', '?', '#'):
        host = host.split(delimiter, 1)[0]
    # Drop credentials, then the port.
    host = host.rsplit('@', 1)[-1]
    host = host.split(':', 1)[0]
    if host[:4].lower() == 'www.':
        host = host[4:]
    return host
def get_main_domain(domain):
    """Reduce *domain* to its registrable part.

    Domains ending in one of the two-level Chinese suffixes listed below keep
    their last three labels (``a.b.example.com.cn`` -> ``example.com.cn``);
    every other domain keeps its last two (``a.b.example.com`` ->
    ``example.com``).  A domain that is already that short is returned as-is.

    The suffix is tested with ``endswith`` so that a name which merely
    contains e.g. ``.com.cn`` somewhere in the middle is not misclassified.
    """
    double_exts = ('.com.cn', '.edu.cn', '.gov.cn', '.org.cn', '.net.cn')
    keep = 3 if domain.endswith(double_exts) else 2
    labels = domain.split('.')
    if len(labels) <= keep:
        return domain
    return '.'.join(labels[-keep:])
def requests_headers():
    """Build a browser-like header dict with a randomly picked User-Agent.

    Every call draws a fresh User-Agent from the fixed pool below; all other
    header values are constant.
    """
    agent_pool = (
        'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1 ; x64; en-US; rv:1.9.1b2pre) Gecko/20081026 Firefox/3.1b2pre',
        'Opera/10.60 (Windows NT 5.1; U; zh-cn) Presto/2.6.30 Version/10.60',
        'Opera/8.01 (J2ME/MIDP; Opera Mini/2.0.4062; en; U; ssr)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ; rv:1.9.0.14) Gecko/2009082707 Firefox/3.0.14',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; fr; rv:1.9.2.4) Gecko/20100523 Firefox/3.6.4 ( .NET CLR 3.5.30729)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/528.16 (KHTML, like Gecko) Version/4.0 Safari/528.16',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
    )
    return {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': random.choice(agent_pool),
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        "Referer": "http://www.baidu.com/link?url=www.so.com&url=www.soso.com&&url=www.sogou.com",
    }
# Suffix fragments that the script at the bottom of this file looks for at the
# tail of every extracted link.  Despite the name, links that match one of
# these are the ones that get downloaded and hashed.  Some entries carry a
# leading dot and some do not.
excludeext = [
    '.png',
    '.ico',
    '.gif',
    '.svg',
    '.jpeg',
    'js',
    'css',
    'xml',
    'txt',
]
def getPageLinks(url):
    """Fetch *url* and return the link-like attribute values found in it.

    For every tag/attribute combination below, both the double-quoted and the
    single-quoted form are searched for with a regular expression over the
    UTF-8 encoded page text.  Each distinct value is returned once, in order
    of first appearance.

    Returns an empty list when the page cannot be fetched.
    """
    try:
        response = requests.get(url, timeout=5, headers=requests_headers(), verify=False)
        content = response.text.encode('utf-8')
    except Exception:
        # Deliberate best-effort: an unreachable or undecodable page simply
        # contributes no links.
        return []

    tags = ['a', 'A', 'link', 'script', 'area', 'iframe', 'form']  # img
    tos = ['href', 'src', 'action']

    links = []
    seen = set()  # O(1) duplicate check instead of scanning `links`
    for tag in tags:
        for to in tos:
            double_quoted = re.findall(r'<%s.*?%s="(.*?)"' % (tag, to), content)
            single_quoted = re.findall(r'<%s.*?%s=\'(.*?)\'' % (tag, to), content)
            for link in double_quoted + single_quoted:
                if link not in seen:
                    seen.add(link)
                    links.append(link)
    return links
def getMD5(c):
    """Return the hexadecimal MD5 digest of the byte string *c*."""
    return hashlib.md5(c).hexdigest()
def request_url(url):
    """Download *url* and return the raw response body.

    Returns the body only for an HTTP 200 response.  Any other status code,
    and any error raised while making the request, yields an empty string so
    callers can treat "nothing usable" uniformly.
    """
    try:
        response = requests.get(url=url, headers=requests_headers(), timeout=5, verify=False)
        if response.status_code == 200:
            # Raw bytes; no text decoding is involved.
            return response.content
        return ''
    except Exception:
        # Deliberate best-effort: a failed download is reported as no content.
        return ''
def update_mysql(conn, url, filename, md5):
    """Record that *url* serves a file whose digest is *md5*.

    If the ``file_md5`` table already has a row for *md5*, *url* is appended
    to that row's comma-separated ``url`` value and its ``hint`` counter is
    incremented.  Otherwise a new row is inserted with ``hint`` set to 1.

    All values are passed as query parameters rather than interpolated into
    the SQL text, because *url* and *filename* come from scraped pages and
    may contain quotes or hostile input.

    Errors are printed and swallowed so one bad record does not stop a scan.
    """
    cur = None
    try:
        cur = conn.cursor()
        # Select by column name so the code does not depend on column order;
        # "limit 1" guarantees no unread rows are left on the cursor.
        select_sql = "select url, hint from file_md5 where md5 = %s limit 1"
        print('%s %r' % (select_sql, (md5,)))
        cur.execute(select_sql, (md5,))
        existing = cur.fetchone()
        if existing:
            sql = "update file_md5 set url = %s, hint = %s where md5 = %s"
            params = (existing[0] + ',' + url, int(existing[1]) + 1, md5)
        else:
            sql = "INSERT INTO file_md5 (url,filename,md5,hint) VALUES (%s,%s,%s,%s)"
            params = (url, filename, md5, 1)
        print('%s %r' % (sql, params))
        cur.execute(sql, params)
        conn.commit()
    except Exception as e:
        print(e)
    finally:
        if cur is not None:
            cur.close()
# Script body: for every page listed in url.txt, collect the static-file links
# on that page, download each one and store its MD5 in the database.
try:
    from urlparse import urljoin  # Python 2, which the rest of this file targets
except ImportError:
    from urllib.parse import urljoin


def _is_wanted_link(link):
    """Return True when the tail of *link* contains an ``excludeext`` fragment.

    The inspected tail is four characters long, widened for longer fragments
    so that five-character entries such as '.jpeg' are able to match.
    """
    for ext in excludeext:
        if ext in link[-max(4, len(ext)):]:
            return True
    return False


conn = mysql.connector.connect(
    user=mysql_config['username'],
    password=mysql_config['password'],
    host=mysql_config['host'],
    port=mysql_config['port'],
    database=mysql_config['db_name'],
    charset='utf8')
# url = 'http://www.sddlr.gov.cn/'
try:
    # Read the whole target list up front so the file handle is closed
    # before any network work starts.
    with open('url.txt', 'r') as url_file:
        targets = [line.strip() for line in url_file]

    for url in targets:
        if not url:
            continue  # skip blank lines
        print(url)
        pageLinks = getPageLinks(url)
        print(pageLinks)

        # Resolve every wanted link against the page URL so relative,
        # root-relative and protocol-relative links all become absolute.
        true_url = [urljoin(url, suburl) for suburl in pageLinks
                    if _is_wanted_link(suburl)]
        print("原长度:  %d" % len(true_url))
        url_tmp2 = list(set(true_url))
        print("去重后:  %d" % len(url_tmp2))

        for url_tmp in url_tmp2:
            url_content = request_url(url_tmp)
            if not url_content:
                # Failed download: hashing '' would file every failure
                # under the same MD5.
                continue
            filename = os.path.basename(url_tmp)
            md5 = getMD5(url_content)
            print('%s   %s %s' % (url_tmp, filename, md5))
            update_mysql(conn, url_tmp, filename, md5)
finally:
    conn.close()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。