代码拉取完成,页面将自动刷新
import pymysql
import matplotlib.pyplot as plt
HOST = "localhost"
USER = "root"
PASSWORD = "123456"
DATABASE = "51job"
PORT = 3306,
CHARSET = "utf8"
TABLE_NAME = "51job_table"
SAVEFILE_PATH = "../data/"
list_x = []
list_y = []
connection = pymysql.Connect(host=HOST, user=USER, password=PASSWORD, database=DATABASE, port=3306,
charset=CHARSET)
cur = connection.cursor()
def matplot_chinese():
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文标签
plt.rcParams['axes.unicode_minus'] = False
HEADER = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': '_uab_collina=164083188927394862921656; partner=www_baidu_com; privacy=1640831873; guid=3ec9968fb69da6c4c985b99a92fb1e6c; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; acw_tc=2f624a5116408410489218571e1f29773fdfcbc25d684039598ac10fdef5aa; acw_sc__v2=61cd3f8c7ff55fd2001524e1730e634059bd99cd; acw_sc__v2=61cd3f8c7ff55fd2001524e1730e634059bd99cd; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C8%CB%B9%A4%D6%C7%C4%DC%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%B4%F3%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%600%7C%21; arp_scroll_position=94; ssxmod_itna=Qq0x0Dci0=YmuRDl4iqYKEP7qciUFhDR7rpO7Qmx0vP+eGzDAxn40iDtorT1Yo8ie4NdF=Wppx8nj8hK46WRWw0aKjFr4GLDmKDyKYb4xoD445GwD0eG+DD4DWiqB6gDtqDkD3qDdUx7PDch5DLxi7zgIPDB+Yd8iCDQPDy2PDIxD1r3whDiIzDYvkDimzD75HDQIzbKYD+xTAuxi32ODDBh43970Aj=Mj/rgp8Rgv+9oACO8AnG0wNtM3n1yzUxibzpS+2r1zS=MNWj+YIQhYmhiWFiGq1AmohQGxdEroIGtPqjDiCeXeYr4KDDiOxqGxRjDxD=; ssxmod_itna2=Qq0x0Dci0=YmuRDl4iqYKEP7qciUFhDR7rpO7QDnI8YhKDsNeDLehiL=G7Y4qnRGCnjjDYP8YyDWA0pVjiuDrq52PnIAx0FiMvq+2+/Te=456DUe=edUA2BWAeEDTzn19obLGXEBPqEEA0xdBxKh/roeGEQPoSQ52EYeZfIpmmYwDqfRAlfi4qK=jWT7fxfVQ3WAY1E=lbeN=+m7P+eerp4mmPIcGSbsjDar/kdqZ3=YZPoAIl7WaKy1+1bK5Y28dx2Id9WrS43vtqh2lUBKkUddlOG0OHWria=8r1Z7KAOd/Mj4bt3qhYgCE/=IdrL2B3Kg6si3nEj56=mPrK23pCI0=d7n=+C676je=U=SPzaPureVBPHP5a83gBm7Y5=t5jlm2DiFbKr158YNZQvhgdE0I7WIEGdwbedYnVGXhboxOfi6RT=mD4dcCIZ2KEtpooKX3f+0a36IoOmcOfo6jUBE3zWTp9pnP7Fa=feTgfl9TTomVBrCtb/2ntFExvnnnAmo9PscK5uKvTD4ggYNdA4DQFoKDMlaTTbNAqvP9mx1OF0by8/CpMb4PiiX4cfobLhwvTbgA1=C5xn8Wz=DZ5qUZ/YiODH=nGYGTglPDZ02Hi=GDDFqD+aPqAx5G+lm+975mMleBC=YIPYD',
'Host': 'search.51job.com',
'Referer': 'https://search.51job.com/list/000000,000000,0000,00,9,99,%E5%A4%A7%E6%95%B0%E6%8D%AE,2,1.html?',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Microsoft Edge";v="96"',
'sec-ch-ua-mobile': '?0',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62',
}
def format_url(page):
# URL_STYLE = "https://search.51job.com/list/000000,000000,0000,26,9,99,+,2,{}.html".format(page)
URL_STYLE = ["https://search.51job.com/list/000000,000000,0000,00,9,99,大数据,2,{}.html?".format(page),
"https://search.51job.com/list/000000,000000,0000,00,9,99,人工智能,2,{}.html?".format(page),
]
return URL_STYLE
def format_write(i):
format_string = "" \
"{job_name}\t{company_name}\t{providesalary_text}\t{workarea_text}\t" \
"{updatedate}\t{companytype_text}\t{degreefrom}\t{workyear}\t" \
"{issuedate}\t{jobwelf}\t{attribute_text}\t" \
"{companysize_text}\t{companyind_text}{newline}" \
.format(
job_name=i["job_name"], company_name=i["company_name"],
providesalary_text=i["providesalary_text"],
workarea_text=i["workarea_text"], updatedate=i["updatedate"],
companytype_text=i["companytype_text"],
degreefrom=i["degreefrom"], workyear=i["workyear"],
issuedate=i["issuedate"], jobwelf=i["jobwelf"],
attribute_text=" ".join(i["attribute_text"]),
companysize_text=i["companysize_text"],
companyind_text="not_found",
newline="\n")
return format_string
CREATE_TABLE_51JOB = """
create table if not exists 51job_table(
job_name varchar(100), company_name varchar(100),
providesalary_text varchar(100), workarea_text varchar(100),
updatedate varchar(100), companytype_text varchar(100),
degreefrom varchar(50), workyear varchar(50),
issuedate varchar(50), jobwelf varchar(100),
attribute_text varchar(100), companysize_text varchar(50),
companyind_text varchar(50)
)
"""
CLEAR_TABLE = """
truncate table 51job_table
"""
COMPANYTYPE_TEXT = """
select substring_index(companytype_text, '-', 1) companytype, count(*) companytype_count
from 51job_table a
group by companytype
order by companytype_count desc
"""
HEAD_TOP_COMPANY = """
select distinct company_name, count(company_name) company_count
from 51job_table
group by company_name
order by company_count desc
limit 10;
"""
JOBWELF_COUNT = """
select count(*) from 51job_table where jobwelf != ''
"""
WORKAREA_SORT = """
select substring_index(workarea_text, '-', 1) workarea_province, count(*) workarea_count
from 51job_table a
group by workarea_province
order by workarea_count desc
limit 20;
"""
def format_insert(i):
INSERT_VALUES = "" \
"('{job_name}', '{company_name}', '{providesalary_text}', '{workarea_text}', " \
"'{updatedate}', '{companytype_text}', '{degreefrom}', '{workyear}', " \
"'{issuedate}', '{jobwelf}', '{attribute_text}', " \
"'{companysize_text}', '{companyind_text}') " \
"".format(
job_name=i[0].strip("\\"), company_name=i[1].replace("\'", ""),
providesalary_text=i[2],
workarea_text=i[3], updatedate=i[4],
companytype_text=i[5],
degreefrom=i[6], workyear=i[7],
issuedate=i[8], jobwelf=i[9],
attribute_text=" ".join(i[10]),
companysize_text=i[11],
companyind_text=i[12].strip()
)
return INSERT_VALUES
DEGREEFROM_WORKYEAR_COMPANYSIZE_TEXT = """
select degreefrom, workyear, companysize_text
from 51job_table
where degreefrom !="" and workyear != "" and companysize_text != "" and companytype_text != "";
"""
ANALYSIS_COMPANYTYPE_TEXT = """
select companytype_text
from 51job_table
where degreefrom !="" and workyear != "" and companysize_text != "" and companytype_text != "";
"""
########################################################################################################################
# new
SALARY_WORKTIME="""
select providesalary_text,workarea_text from 51job_table limit 10;
"""
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。