From 94511870d6365ea3f2596096d5f2807dc078592c Mon Sep 17 00:00:00 2001 From: qinxiao <1872826298@163.com> Date: Wed, 25 Oct 2023 20:08:02 +0800 Subject: [PATCH 1/4] First Push -add a file,oil_web_list.json -add OilPriceResponseFileSystemManager in _datamanager.py -add OilCrawler in _crawler I have to say that so many thing is wrong --- modules/_crawler.py | 143 +++++++++++++++++++++++++++++++++++++--- modules/_datamanager.py | 19 +++++- oil_web_list.json | 13 ++++ 3 files changed, 166 insertions(+), 9 deletions(-) create mode 100644 oil_web_list.json diff --git a/modules/_crawler.py b/modules/_crawler.py index 1d06513..0831e0b 100644 --- a/modules/_crawler.py +++ b/modules/_crawler.py @@ -4,12 +4,10 @@ from config import Config from ._datapackage import * from abc import ABC, abstractmethod import pandas as pd -import requests +import requests,json,os,time +from time import sleep from loguru import logger -import json -import time - - +from lxml import etree ''' 爬虫类 ''' @@ -226,14 +224,143 @@ class OilCrawler(CrawlerInterface): self.url = Config.CrawlerConfig.OilCrawlerConfig.URL # 通过这里来了解如何从配置文件获取常量, 当然你可以在自己的配置类里增加任何内容 # self.try_again_seconds = ??? # 请指定请求失败后的重新请求时间, 该属性必须填写 - + def request_and_process_data(self): + + #爬取函数,调用json_process,返回内容为一个列表,第一个值是爬取内容(文本),第二个值是状态码 + def climb(): + web_url=json_process(0) + if web_url == "http://youjia.10260.com/chaiyou/": + response = requests.get(web_url) + code= response.status_code + print("爬取成功") + return [response.text,code] + elif web_url == "https://www.cngold.org/crude/chaiyou.html": + print("Error") + return(None) + elif web_url == "https://www.5waihui.com/oil/cn/": + print("Error") + return(None) + elif web_url == "https://www.05348.com/": + print("Error") + return(None) + elif web_url == "http://gas.humeup.cn/0haochaiyou/": + print("Error") + return(None) + else: + print("Error") + return(None) + + #json文件处理,返回一个URL + def json_process(process_choice): + + process=["oil_web_list","readme","position_name"] + + #打开json库 + with open(f'{os.getcwd()}\\oil_web_list.json', 'r',encoding = "utf-8") as pf: + txt=pf.read() + json_dic = json.loads(txt)[process[process_choice]] + pf.close() + + #调用oil_web_list: + if process_choice==0: + + web=["first","second","third","fourth","fivth"] + web_choice=0 + + #尝试连接网站 + while True: + response = requests.get(json_dic[web[web_choice]]) + code = response.status_code + if code==200: + web_url=json_dic[web[web_choice]] + print("Connect successfully:\t"+web_url) + break + else: + if web_choice<4: + web_choice+=1 + else: + web_choice=0 + return str(web_url) + + #调用readme: + if process_choice==1: + return json_dic + + #调用position_name: + if process_choice==2: + return json_dic + + time = datetime.datetime.now() + + #使用xpath解析网页元素,输出一个键为省名,值一个列表为字典 + def analyze_with_xpath(file): + with open(file,'r',encoding="utf-8") as pf: + r= pf.read() + tree = etree.HTML(r) + bows_values = tree.xpath('//td/text()') + print("bows_values:") + print(bows_values) + + #打包成组 + bows_values_packaged=[] + for i in range(int(len(bows_values)/5)):#组的序号 + package=[bows_values[i*5],bows_values[i*5+1],bows_values[i*5+2],bows_values[i*5+3],bows_values[i*5+4]] + bows_values_packaged.append(package) + print("bows_values_packaged:") + print(bows_values_packaged) + + #将json文件当中“position_name”的键名转化成bows_keys列表的值 + bows={} + bows_keys=list(OilCrawler.json_process(2)) + for i in range(3): + bows_keys.pop() + i=0#打包的序号 + for bow_key in bows_keys: + bows[bow_key]=bows_values_packaged[i] + i+=1 + print('解析已完成') + + return bows + + + #循环部分,在下一行设置具体时间 
+ if time.strftime("%H:%M:%S")=="11:00:00": + + #创建html文本文件并保存 + file_name=f"{time.strftime('%Y-%m-%d')}_oil_price" + f=open(f"{file_name}.html","w+",encoding="utf-8") + f.write(climb()[0]) + + #html文本文件出现空行问题,重新读取一遍数据删除空行 + _f=open(f"_{file_name}.html","w+",encoding="utf-8") + f.seek(0) + rl=f.readlines() + for i in rl: + if i != "\n" : + _f.write(i) + + #输出时间 + print(time) + + f.close() + _f.close() + + data=analyze_with_xpath(f"_{file_name}.html") + print(data) + + #将数据写入json文件 + with open(f"{file_name}.json","w",encoding="utf-8") as _f: + json.dump(data,_f) + + sleep(60)#防止重复爬取 + is_request_success = True # 请求是否成功? # 请求数据 # ... # 得到的数据以字符串存储 - text = "1234567" - + text = str(json.dump(data,_f)) + if not is_request_success: # 请求失败则返回None return None diff --git a/modules/_datamanager.py b/modules/_datamanager.py index fbe0ce4..defccf5 100644 --- a/modules/_datamanager.py +++ b/modules/_datamanager.py @@ -6,6 +6,9 @@ from abc import ABC, abstractmethod import pandas as pd from loguru import logger + + + ''' 数据管理器类 ''' @@ -175,7 +178,7 @@ class DailyWeatherResponseFileSystemManager(ResponseFileSystemManager): class ExtremeWeatherResponseFileSystemManager(ResponseFileSystemManager): ''' - 日常天气响应存到文件系统 + 极限天气响应存到文件系统 ''' def __init__(self): super().__init__() @@ -183,3 +186,17 @@ class ExtremeWeatherResponseFileSystemManager(ResponseFileSystemManager): Config.DataPackageConfig.EXTREME_WEATHER_RESPONSE_MERGE_TAG: self.merge_to_csv, } self.response_file_path = Config.DataManagerConfig.FileSystemManagerConfig.ResponseFileSystemManagerConfig.ExtremeWeatherResponseFileSystemManagerConfig.RESPONSE_FILE_PATH + +class OilPriceResponseFileSystemManager(ResponseFileSystemManager): + ''' + 油价响应存到文件系统 + ''' + + def __init__(self): + super().__init__() + self._callback_func_dict = { + Config.DataPackageConfig.EXTREME_WEATHER_RESPONSE_MERGE_TAG: self.merge_to_csv, + } + self.response_file_path = Config.DataManagerConfig.FileSystemManagerConfig.ResponseFileSystemManagerConfig.ExtremeWeatherResponseFileSystemManagerConfig.RESPONSE_FILE_PATH + + \ No newline at end of file diff --git a/oil_web_list.json b/oil_web_list.json new file mode 100644 index 0000000..9773549 --- /dev/null +++ b/oil_web_list.json @@ -0,0 +1,13 @@ +{ + "oil_web_list":{ + "first":"http://youjia.10260.com/chaiyou/", + "second":"https://www.cngold.org/crude/chaiyou.html", + "third":"https://www.5waihui.com/oil/cn/", + "fourth":"https://www.05348.com/", + "fivth":"http://gas.humeup.cn/0haochaiyou/" + }, + "readme":"存放了五个网站,在前一个网站不稳定时,将会切换到后一个网站", + "position_name":{ + "北京": "Beijin", "天津": "Tianjin", "河北": "Hebei", "山西": "Shanxi", "内蒙古": "NeiMenggu", "辽宁": "Liaoning", "吉林": "Jilin", "黑龙江": "Heilongjiang", "上海": "Shanghai", "江苏": "Jiangsu", "浙江": "Zhejiang", "安徽": "Anhui", "福建": "Fujian", "江西": "Jiangxi", "山东": "Shandong", "河南": "Henan", "湖北": "Hubei", "湖南": "Hunan", "广东": "Guangadong", "广西": "Guangxi", "海南": "Hainan", "重庆": "Chongqing", "四川": "Sichuan", "贵州": "Guizhou", "云南": "Yunnan", "西藏": "Xizang", "陕西": "Shaanxi", "甘肃": "Gansu", "青海": "Qinghai", "宁夏": "Ningxia", "新疆": "Xinjiang", "台湾": "Taiwan", "香港": "HongKong", "澳门": "Macau" + } +} \ No newline at end of file -- Gitee From 6b2dec817b5c8510fd9520626e8ea79557f13ffe Mon Sep 17 00:00:00 2001 From: qinxiao <1872826298@163.com> Date: Tue, 21 Nov 2023 18:39:08 +0800 Subject: [PATCH 2/4] fix --- modules/_crawler.py | 351 ++++++------------ oil_price/oil.py | 156 ++++++++ .../oil_web_list.json | 2 +- 3 files changed, 276 insertions(+), 233 deletions(-) create mode 100644 oil_price/oil.py 
rename oil_web_list.json => oil_price/oil_web_list.json (95%) diff --git a/modules/_crawler.py b/modules/_crawler.py index 0831e0b..54ef678 100644 --- a/modules/_crawler.py +++ b/modules/_crawler.py @@ -4,372 +4,259 @@ from config import Config from ._datapackage import * from abc import ABC, abstractmethod import pandas as pd -import requests,json,os,time +import requests, json, os, time from time import sleep from loguru import logger from lxml import etree -''' + +""" 爬虫类 -''' +""" class CrawlerInterface(DataPackageHandler): def __init__(self): super().__init__() - self._callback_func_dict: dict[str: callable] - self._input_columns_type_dict: list[dict[str: list[callable]]] = [Config.DataPackageConfig.NOT_INIT_COLUMNS, Config.CrawlerConfig.RESPONSE_COLUMNS] - self._output_columns_type_dict: list[dict[str: list[callable]]] = [Config.CrawlerConfig.RESPONSE_COLUMNS] + self._callback_func_dict: dict[str:callable] + self._input_columns_type_dict: list[dict[str : list[callable]]] = [ + Config.DataPackageConfig.NOT_INIT_COLUMNS, + Config.CrawlerConfig.RESPONSE_COLUMNS, + ] + self._output_columns_type_dict: list[dict[str : list[callable]]] = [ + Config.CrawlerConfig.RESPONSE_COLUMNS + ] self.failed_tag = Config.DataPackageConfig.CHILD_NOT_INIT_TAG self.try_again_seconds: int - - - @ abstractmethod - def request_and_process_data(self)-> pd.DataFrame|None: - ''' + + @abstractmethod + def request_and_process_data(self) -> pd.DataFrame | None: + """ 子类必须重写的抽象方法, 爬取数据, 处理成给定字段的pd.DataFrame, 如果爬取失败,返回None - ''' + """ pass - - - - def _run(self, datapackage: DataPackage)-> None: - ''' + + def _run(self, datapackage: DataPackage) -> None: + """ 子类无需关心的方法, 用于运行爬虫和进行错误处理 - ''' + """ result = self.request_and_process_data() - if result is None: # 返回一个请求失败的包 datapackage.tag_stack.append(self.failed_tag) - datapackage.df = pd.DataFrame({"爬取时间":['ERROR'], "响应文本":["ERROR"]}) - datapackage_str = str(datapackage).replace('\n', '\n\t') - logger.error(f"<处理器: {type(self).__name__} 环节: 无响应或响应不合法>:\n\t重新请求时间: {self.try_again_seconds}秒之后\n\t数据包细节:\n\t{datapackage_str}") + datapackage.df = pd.DataFrame({"爬取时间": ["ERROR"], "响应文本": ["ERROR"]}) + datapackage_str = str(datapackage).replace("\n", "\n\t") + logger.error( + f"<处理器: {type(self).__name__} 环节: 无响应或响应不合法>:\n\t重新请求时间: {self.try_again_seconds}秒之后\n\t数据包细节:\n\t{datapackage_str}" + ) return False - datapackage.df = result - datapackage_str = str(datapackage).replace('\n', '\n\t') - logger.success(f"<处理器: {type(self).__name__} 环节: 得到成功的响应>:\n\t数据包细节:\n\t{datapackage_str}") + datapackage_str = str(datapackage).replace("\n", "\n\t") + logger.success( + f"<处理器: {type(self).__name__} 环节: 得到成功的响应>:\n\t数据包细节:\n\t{datapackage_str}" + ) return True - - - def _rerun(self, datapackage: DataPackage)-> None: - if check_time_exceeded(datapackage.init_time, '%Y-%m-%d %H:%M:%S.%f', self.try_again_seconds): + + def _rerun(self, datapackage: DataPackage) -> None: + if check_time_exceeded( + datapackage.init_time, "%Y-%m-%d %H:%M:%S.%f", self.try_again_seconds + ): is_success = self._run(datapackage) if not is_success: # 如果二次请求还是失败,那么直接让回收器销毁数据包吧 - datapackage_str = str(datapackage).replace('\n', '\n\t') - logger.critical(f"<处理器: {type(self).__name__} 环节: 二次请求失败,停止本轮爬取任务>:\n\t数据包细节:\n\t{datapackage_str}") + datapackage_str = str(datapackage).replace("\n", "\n\t") + logger.critical( + f"<处理器: {type(self).__name__} 环节: 二次请求失败,停止本轮爬取任务>:\n\t数据包细节:\n\t{datapackage_str}" + ) datapackage.tag_stack = [] - return - # 如果没有到时间, 把处理前弹出的failed_tag补一个新的回去 
datapackage.tag_stack.append(self.failed_tag) - - - -@ singleton + + +@singleton class VegetableCrawler(CrawlerInterface): def __init__(self): super().__init__() self._callback_func_dict = { Config.DataPackageConfig.VEGETABLE_PRICE_RESPONSE_GET_TAG: self._run, Config.DataPackageConfig.VEGETABLE_PRICE_CRAWL_FAILED_TAG: self._rerun, - } + } self.failed_tag = Config.DataPackageConfig.VEGETABLE_PRICE_CRAWL_FAILED_TAG self.url = Config.CrawlerConfig.VegetableCrawlerConfig.URL - self.try_again_seconds = Config.CrawlerConfig.VegetableCrawlerConfig.TRY_AGAIN_SECONDS + self.try_again_seconds = ( + Config.CrawlerConfig.VegetableCrawlerConfig.TRY_AGAIN_SECONDS + ) - def request_and_process_data(self): # 这是我实现的爬虫类, 可以参考它来编写天气和经济的爬虫 try: - response = requests.post(url = self.url, timeout=30) + response = requests.post(url=self.url, timeout=30) except: return None - text = response.text statue_code = response.status_code if statue_code != 200: return None - - result_df = pd.DataFrame({'爬取时间':[Config.CrawlerConfig.NOW_TIME_FUNC()], '响应文本':[text]}) + result_df = pd.DataFrame( + {"爬取时间": [Config.CrawlerConfig.NOW_TIME_FUNC()], "响应文本": [text]} + ) return result_df - -@ singleton +@singleton class DailyWeatherCrawler(CrawlerInterface): def __init__(self): super().__init__() self._callback_func_dict = { Config.DataPackageConfig.DAILY_WEATHER_RESPONSE_GET_TAG: self._run, Config.DataPackageConfig.DAILY_WEATHER_CRAWL_FAILED_TAG: self._rerun, - } + } self.failed_tag = Config.DataPackageConfig.DAILY_WEATHER_CRAWL_FAILED_TAG - - self.url = Config.CrawlerConfig.DailyWeatherCrawlerConfig.URL # 通过这里来了解如何从配置文件获取常量, 当然你可以在自己的配置类里增加任何内容 + self.url = ( + Config.CrawlerConfig.DailyWeatherCrawlerConfig.URL + ) # 通过这里来了解如何从配置文件获取常量, 当然你可以在自己的配置类里增加任何内容 # self.try_again_seconds = ??? # 请指定请求失败后的重新请求时间, 该属性必须填写 self.cityids = Config.CrawlerConfig.DailyWeatherCrawlerConfig.cityids self.headers = Config.CrawlerConfig.DailyWeatherCrawlerConfig.headers self.year = Config.CrawlerConfig.DailyWeatherCrawlerConfig.current_year self.month = Config.CrawlerConfig.DailyWeatherCrawlerConfig.current_month - + def request_and_process_data(self): - text = "" for cityid in self.cityids: - params = { - "areaInfo[areaId]": cityid, - "areaInfo[areaType]": 2, - "date[year]": self.year, - "date[month]": self.month + "areaInfo[areaId]": cityid, + "areaInfo[areaType]": 2, + "date[year]": self.year, + "date[month]": self.month, } - try: - response = requests.get(url = self.url, headers=self.headers, params=params) + response = requests.get( + url=self.url, headers=self.headers, params=params + ) text += response.text + "$$$" - except: return None - # 如果请求数据成功, 那么返回给定字段的pd.DataFrame,值得注意的是,'爬取时间'的值是固定的,也就是使用统一的Config.CrawlerConfig.NOW_TIME_FUNC函数 # '响应文本'就是获取到的源数据,它虽然需要被存储到数据库, 但对字符串本身的格式没有要求, 因为将来你还需自定义相应的数据清洗器来将它转变成规定的格式 # 如果你对需要返回的格式仍旧有疑问, 那么请转到父类CrawlerInterface的_output_columns_type_dict属性, 它规定了从这儿走出去的数据包的表格的字段需要是什么样子的 - result_df = pd.DataFrame({'爬取时间':[Config.CrawlerConfig.NOW_TIME_FUNC()], '响应文本':[text]}) - + result_df = pd.DataFrame( + {"爬取时间": [Config.CrawlerConfig.NOW_TIME_FUNC()], "响应文本": [text]} + ) + # 最后如果请求成功,则返回这个表 return result_df - - -@ singleton + + +@singleton class ExtremeWeatherCrawler(CrawlerInterface): def __init__(self): super().__init__() self._callback_func_dict = { Config.DataPackageConfig.EXTREME_WEATHER_RESPONSE_GET_TAG: self._run, Config.DataPackageConfig.EXTREME_WEATHER_CRAWL_FAILED_TAG: self._rerun, - } + } self.failed_tag = Config.DataPackageConfig.EXTREME_WEATHER_CRAWL_FAILED_TAG - - self.url = 
Config.CrawlerConfig.ExtremeWeatherCrawlerConfig.URL # 通过这里来了解如何从配置文件获取常量, 当然你可以在自己的配置类里增加任何内容 + + self.url = ( + Config.CrawlerConfig.ExtremeWeatherCrawlerConfig.URL + ) # 通过这里来了解如何从配置文件获取常量, 当然你可以在自己的配置类里增加任何内容 # self.try_again_seconds = ??? # 请指定请求失败后的重新请求时间, 该属性必须填写 - def request_and_process_data(self): - is_request_success = False # 请求是否成功? + is_request_success = False # 请求是否成功? # 请求数据 # ... # 得到的数据以字符串存储 text = "1234567" - + if not is_request_success: # 请求失败则返回None return None - - + # 如果请求数据成功, 那么返回给定字段的pd.DataFrame,值得注意的是,'爬取时间'的值是固定的,也就是使用统一的Config.CrawlerConfig.NOW_TIME_FUNC函数 # '响应文本'就是获取到的源数据,它虽然需要被存储到数据库, 但对字符串本身的格式没有要求, 因为将来你还需自定义相应的数据清洗器来将它转变成规定的格式 # 如果你对需要返回的格式仍旧有疑问, 那么请转到父类CrawlerInterface的_output_columns_type_dict属性, 它规定了从这儿走出去的数据包的表格的字段需要是什么样子的 - result_df = pd.DataFrame({'爬取时间':[Config.CrawlerConfig.NOW_TIME_FUNC()], '响应文本':[text]}) - + result_df = pd.DataFrame( + {"爬取时间": [Config.CrawlerConfig.NOW_TIME_FUNC()], "响应文本": [text]} + ) + # 最后如果请求成功,则返回这个表 return result_df - - -@ singleton + + +@singleton class EconomyCrawler(CrawlerInterface): def __init__(self): super().__init__() self._callback_func_dict = { Config.DataPackageConfig.ECONOMY_RESPONSE_GET_TAG: self._run, Config.DataPackageConfig.ECONOMY_CRAWL_FAILED_TAG: self._rerun, - } + } self.failed_tag = Config.DataPackageConfig.ECONOMY_CRAWL_FAILED_TAG - self.url = Config.CrawlerConfig.EconomyCrawlerConfig.URL # 通过这里来了解如何从配置文件获取常量, 当然你可以在自己的配置类里增加任何内容 + self.url = ( + Config.CrawlerConfig.EconomyCrawlerConfig.URL + ) # 通过这里来了解如何从配置文件获取常量, 当然你可以在自己的配置类里增加任何内容 # self.try_again_seconds = ??? # 请指定请求失败后的重新请求时间, 该属性必须填写 - def request_and_process_data(self): - is_request_success = False # 请求是否成功? + is_request_success = False # 请求是否成功? # 请求数据 # ... # 得到的数据以字符串存储 text = "1234567" - + if not is_request_success: # 请求失败则返回None return None - - + # 如果请求数据成功, 那么返回给定字段的pd.DataFrame,值得注意的是,'爬取时间'的值是固定的,也就是使用统一的Config.CrawlerConfig.NOW_TIME_FUNC函数 # '响应文本'就是获取到的源数据,它虽然需要被存储到数据库, 但对字符串本身的格式没有要求, 因为将来你还需自定义相应的数据清洗器来将它转变成规定的格式 # 如果你对需要返回的格式仍旧有疑问, 那么请转到父类CrawlerInterface的_output_columns_type_dict属性, 它规定了从这儿走出去的数据包的表格的字段需要是什么样子的 - result_df = pd.DataFrame({'爬取时间':[Config.CrawlerConfig.NOW_TIME_FUNC()], '响应文本':[text]}) - + result_df = pd.DataFrame( + {"爬取时间": [Config.CrawlerConfig.NOW_TIME_FUNC()], "响应文本": [text]} + ) + # 最后如果请求成功,则返回这个表 return result_df - -@ singleton -class OilCrawler(CrawlerInterface): + + +@singleton +class OilPriceCrawler(CrawlerInterface): def __init__(self): super().__init__() self._callback_func_dict = { Config.DataPackageConfig.OIL_PRICE_RESPONSE_GET_TAG: self._run, Config.DataPackageConfig.OIL_PRICE_CRAWL_FAILED_TAG: self._rerun, - } + } self.failed_tag = Config.DataPackageConfig.OIL_PRICE_CRAWL_FAILED_TAG - - self.url = Config.CrawlerConfig.OilCrawlerConfig.URL # 通过这里来了解如何从配置文件获取常量, 当然你可以在自己的配置类里增加任何内容 - # self.try_again_seconds = ??? # 请指定请求失败后的重新请求时间, 该属性必须填写 + self.url = Config.CrawlerConfig.OilCrawlerConfig.URL + # 通过这里来了解如何从配置文件获取常量, 当然你可以在自己的配置类里增加任何内容 + # self.try_again_seconds = ??? 
+ self.try_again_seconds = ( + Config.CrawlerConfig.VegetableCrawlerConfig.TRY_AGAIN_SECONDS + ) + # 请指定请求失败后的重新请求时间, 该属性必须填写 - def request_and_process_data(self): - - #爬取函数,调用json_process,返回内容为一个列表,第一个值是爬取内容(文本),第二个值是状态码 - def climb(): - web_url=json_process(0) - if web_url == "http://youjia.10260.com/chaiyou/": - response = requests.get(web_url) - code= response.status_code - print("爬取成功") - return [response.text,code] - elif web_url == "https://www.cngold.org/crude/chaiyou.html": - print("Error") - return(None) - elif web_url == "https://www.5waihui.com/oil/cn/": - print("Error") - return(None) - elif web_url == "https://www.05348.com/": - print("Error") - return(None) - elif web_url == "http://gas.humeup.cn/0haochaiyou/": - print("Error") - return(None) - else: - print("Error") - return(None) - - #json文件处理,返回一个URL - def json_process(process_choice): - - process=["oil_web_list","readme","position_name"] - - #打开json库 - with open(f'{os.getcwd()}\\oil_web_list.json', 'r',encoding = "utf-8") as pf: - txt=pf.read() - json_dic = json.loads(txt)[process[process_choice]] - pf.close() - - #调用oil_web_list: - if process_choice==0: - - web=["first","second","third","fourth","fivth"] - web_choice=0 - - #尝试连接网站 - while True: - response = requests.get(json_dic[web[web_choice]]) - code = response.status_code - if code==200: - web_url=json_dic[web[web_choice]] - print("Connect successfully:\t"+web_url) - break - else: - if web_choice<4: - web_choice+=1 - else: - web_choice=0 - return str(web_url) - - #调用readme: - if process_choice==1: - return json_dic - - #调用position_name: - if process_choice==2: - return json_dic - - time = datetime.datetime.now() - - #使用xpath解析网页元素,输出一个键为省名,值一个列表为字典 - def analyze_with_xpath(file): - with open(file,'r',encoding="utf-8") as pf: - r= pf.read() - tree = etree.HTML(r) - bows_values = tree.xpath('//td/text()') - print("bows_values:") - print(bows_values) - - #打包成组 - bows_values_packaged=[] - for i in range(int(len(bows_values)/5)):#组的序号 - package=[bows_values[i*5],bows_values[i*5+1],bows_values[i*5+2],bows_values[i*5+3],bows_values[i*5+4]] - bows_values_packaged.append(package) - print("bows_values_packaged:") - print(bows_values_packaged) - - #将json文件当中“position_name”的键名转化成bows_keys列表的值 - bows={} - bows_keys=list(OilCrawler.json_process(2)) - for i in range(3): - bows_keys.pop() - i=0#打包的序号 - for bow_key in bows_keys: - bows[bow_key]=bows_values_packaged[i] - i+=1 - print('解析已完成') - - return bows - - - #循环部分,在下一行设置具体时间 - if time.strftime("%H:%M:%S")=="11:00:00": - - #创建html文本文件并保存 - file_name=f"{time.strftime('%Y-%m-%d')}_oil_price" - f=open(f"{file_name}.html","w+",encoding="utf-8") - f.write(climb()[0]) - - #html文本文件出现空行问题,重新读取一遍数据删除空行 - _f=open(f"_{file_name}.html","w+",encoding="utf-8") - f.seek(0) - rl=f.readlines() - for i in rl: - if i != "\n" : - _f.write(i) - - #输出时间 - print(time) - - f.close() - _f.close() - - data=analyze_with_xpath(f"_{file_name}.html") - print(data) - - #将数据写入json文件 - with open(f"{file_name}.json","w",encoding="utf-8") as _f: - json.dump(data,_f) - - sleep(60)#防止重复爬取 - - is_request_success = True # 请求是否成功? + is_request_success = False # 请求是否成功? # 请求数据 # ... 
# 得到的数据以字符串存储 - text = str(json.dump(data,_f)) - + text = "1234567" + if not is_request_success: # 请求失败则返回None return None - - + # 如果请求数据成功, 那么返回给定字段的pd.DataFrame,值得注意的是,'爬取时间'的值是固定的,也就是使用统一的Config.CrawlerConfig.NOW_TIME_FUNC函数 # '响应文本'就是获取到的源数据,它虽然需要被存储到数据库, 但对字符串本身的格式没有要求, 因为将来你还需自定义相应的数据清洗器来将它转变成规定的格式 # 如果你对需要返回的格式仍旧有疑问, 那么请转到父类CrawlerInterface的_output_columns_type_dict属性, 它规定了从这儿走出去的数据包的表格的字段需要是什么样子的 - result_df = pd.DataFrame({'爬取时间':[Config.CrawlerConfig.NOW_TIME_FUNC()], '响应文本':[text]}) - + result_df = pd.DataFrame( + {"爬取时间": [Config.CrawlerConfig.NOW_TIME_FUNC()], "响应文本": [text]} + ) + # 最后如果请求成功,则返回这个表 - return result_df \ No newline at end of file + return result_df + + diff --git a/oil_price/oil.py b/oil_price/oil.py new file mode 100644 index 0000000..566a68e --- /dev/null +++ b/oil_price/oil.py @@ -0,0 +1,156 @@ +#数据爬取 +#20231020 + +import requests,datetime,json,os +from time import sleep +from lxml import etree + +class oil(): + + #json文件处理,返回一个URL + def json_process(process_choice): + + process=["oil_web_list","readme","position_name"] + + #打开json库 + with open(f'{os.getcwd()}\\oil_web_list.json', 'r',encoding = "utf-8") as pf: + txt=pf.read() + json_dic = json.loads(txt)[process[process_choice]] + pf.close() + + #调用oil_web_list: + if process_choice==0: + + web=["first","second","third","fourth","fifth"] + web_choice=0 + + #尝试连接网站 + while True: + response = requests.get(json_dic[web[web_choice]]) + code = response.status_code + if code==200: + web_url=json_dic[web[web_choice]] + print("Connect successfully:\t"+web_url) + break + else: + if web_choice<4: + web_choice+=1 + else: + web_choice=0 + return str(web_url) + + #调用readme: + if process_choice==1: + return json_dic + + #调用position_name: + if process_choice==2: + return json_dic + + #使用xpath解析网页元素,输出一个键为省名,值一个列表为字典 + def analyze_with_xpath(file): + with open(file,'r',encoding="utf-8") as pf: + r= pf.read() + tree = etree.HTML(r) + bows_values = tree.xpath('//td/text()') + print("bows_values:") + print(bows_values) + + #打包成组 + bows_values_packaged=[] + for i in range(int(len(bows_values)/5)):#组的序号 + package=[bows_values[i*5],bows_values[i*5+1],bows_values[i*5+2],bows_values[i*5+3],bows_values[i*5+4]] + bows_values_packaged.append(package) + print("bows_values_packaged:") + print(bows_values_packaged) + + #将json文件当中“position_name”的键名转化成bows_keys列表的值 + bows={} + bows_keys=list(oil.json_process(2)) + for i in range(3): + bows_keys.pop() + i=0#打包的序号 + for bow_key in bows_keys: + bows[bow_key]=bows_values_packaged[i] + i+=1 + print('解析已完成') + + return bows + + #更改工作地址 + def change_position(): + retval = os.getcwd() + path="\\elements" + if retval!=path: + os.chdir(path) + + #时间调用示例 + def time_test(): + # 获取当前日期和时间 + now = datetime.datetime.now() + # 格式化日期和时间 + formatted_date = now.strftime("%Y-%m-%d") + formatted_time = now.strftime("%H:%M:%S") + #格式化后的日期: 2023-06-16 + #格式化后的时间: 16:32:00 + + #爬取,调用json_process,返回内容为一个列表,第一个值是爬取内容(文本),第二个值是状态码 + def climb(): + web_url=oil.json_process(0) + if web_url == "http://youjia.10260.com/chaiyou/": + response = requests.get(web_url) + code= response.status_code + print("爬取成功") + return [response.text,code] + elif web_url == "https://www.cngold.org/crude/chaiyou.html": + print("Error") + return(None) + elif web_url == "https://www.5waihui.com/oil/cn/": + print("Error") + return(None) + elif web_url == "https://www.05348.com/": + print("Error") + return(None) + elif web_url == "http://gas.humeup.cn/0haochaiyou/": + print("Error") + return(None) + else: + print("Error") + return(None) + + + 
#运行部分,是一个循环,每24小时一次 + while True: + time = datetime.datetime.now() + + #在下一行设置具体时间 + if time.strftime("%H:%M:%S")=="13:11:00": + + #创建html文本文件并保存 + file_name=f"{time.strftime('%Y-%m-%d')}_oil_price" + f=open(f"{file_name}.html","w+",encoding="utf-8") + f.write(climb()[0]) + + #html文本文件出现空行问题,重新读取一遍数据删除空行 + _f=open(f"_{file_name}.html","w+",encoding="utf-8") + f.seek(0) + rl=f.readlines() + for i in rl: + if i != "\n" : + _f.write(i) + + #输出时间 + print(time) + + f.close() + _f.close() + + data=analyze_with_xpath(f"_{file_name}.html") + print(data) + + #将数据写入json文件 + with open(f"{file_name}.json","w",encoding="utf-8") as _f: + json.dump(data,_f) + + sleep(60)#防止重复爬取 + diff --git a/oil_web_list.json b/oil_price/oil_web_list.json similarity index 95% rename from oil_web_list.json rename to oil_price/oil_web_list.json index 9773549..977fc22 100644 --- a/oil_web_list.json +++ b/oil_price/oil_web_list.json @@ -4,7 +4,7 @@ "second":"https://www.cngold.org/crude/chaiyou.html", "third":"https://www.5waihui.com/oil/cn/", "fourth":"https://www.05348.com/", - "fivth":"http://gas.humeup.cn/0haochaiyou/" + "fifth":"http://gas.humeup.cn/0haochaiyou/" }, "readme":"存放了五个网站,在前一个网站不稳定时,将会切换到后一个网站", "position_name":{ -- Gitee From 2863d0de073e2132accd66055c2d5e455102bdc8 Mon Sep 17 00:00:00 2001 From: qinxiao <1872826298@163.com> Date: Thu, 23 Nov 2023 21:52:56 +0800 Subject: [PATCH 3/4] 123 --- 2023-11-21_oil_price.html | 0 oil_price/oil.py | 294 +++++++++++++++++++------------------- 2 files changed, 148 insertions(+), 146 deletions(-) create mode 100644 2023-11-21_oil_price.html diff --git a/2023-11-21_oil_price.html b/2023-11-21_oil_price.html new file mode 100644 index 0000000..e69de29 diff --git a/oil_price/oil.py b/oil_price/oil.py index 566a68e..3c7a216 100644 --- a/oil_price/oil.py +++ b/oil_price/oil.py @@ -5,152 +5,154 @@ import requests,datetime,json,os from time import sleep from lxml import etree -class oil(): - - #json文件处理,返回一个URL - def json_process(process_choice): - - process=["oil_web_list","readme","position_name"] - - #打开json库 - with open(f'{os.getcwd()}\\oil_web_list.json', 'r',encoding = "utf-8") as pf: - txt=pf.read() - json_dic = json.loads(txt)[process[process_choice]] - pf.close() - - #调用oil_web_list: - if process_choice==0: - - web=["first","second","third","fourth","fifth"] - web_choice=0 - - #尝试连接网站 - while True: - response = requests.get(json_dic[web[web_choice]]) - code = response.status_code - if code==200: - web_url=json_dic[web[web_choice]] - print("Connect successfully:\t"+web_url) - break + + #json文件处理,返回一个URL +def json_process(process_choice): + + process=["oil_web_list","readme","position_name"] + + #打开json库 + with open(f'{os.getcwd()}\\oil_web_list.json', 'r',encoding = "utf-8") as pf: + txt=pf.read() + json_dic = json.loads(txt)[process[process_choice]] + pf.close() + + #调用oil_web_list: + if process_choice==0: + + web=["first","second","third","fourth","fifth"] + web_choice=0 + + #尝试连接网站 + while True: + response = requests.get(json_dic[web[web_choice]]) + code = response.status_code + if code==200: + web_url=json_dic[web[web_choice]] + print("Connect successfully:\t"+web_url) + break + else: + if web_choice<4: + web_choice+=1 else: - if web_choice<4: - web_choice+=1 - else: - web_choice=0 - return str(web_url) - - #调用readme: - if process_choice==1: - return json_dic - - #调用position_name: - if process_choice==2: - return json_dic - - #使用xpath解析网页元素,输出一个键为省名,值一个列表为字典 - def analyze_with_xpath(file): - with open(file,'r',encoding="utf-8") as pf: - r= pf.read() - tree = 
etree.HTML(r) - bows_values = tree.xpath('//td/text()') - print("bows_values:") - print(bows_values) - - #打包成组 - bows_values_packaged=[] - for i in range(int(len(bows_values)/5)):#组的序号 - package=[bows_values[i*5],bows_values[i*5+1],bows_values[i*5+2],bows_values[i*5+3],bows_values[i*5+4]] - bows_values_packaged.append(package) - print("bows_values_packaged:") - print(bows_values_packaged) - - #将json文件当中“position_name”的键名转化成bows_keys列表的值 - bows={} - bows_keys=list(oil.json_process(2)) - for i in range(3): - bows_keys.pop() - i=0#打包的序号 - for bow_key in bows_keys: - bows[bow_key]=bows_values_packaged[i] - i+=1 - print('解析已完成') - - return bows - - #更改工作地址 - def change_position(): - retval = os.getcwd() - path="\\elements" - if retval!=path: - os.chdir(path) - - #时间调用示例 - def time_test(): - # 获取当前日期和时间 - now = datetime.datetime.now() - # 格式化日期和时间 - formatted_date = now.strftime("%Y-%m-%d") - formatted_time = now.strftime("%H:%M:%S") - #格式化后的日期: 2023-06-16 - #格式化后的时间: 16:32:00 + web_choice=0 + return str(web_url) + + #调用readme: + if process_choice==1: + return json_dic + + #调用position_name: + if process_choice==2: + return json_dic + +#使用xpath解析网页元素,输出一个键为省名,值一个列表为字典 +def analyze_with_xpath(file): + with open(file,'r',encoding="utf-8") as pf: + r= pf.read() + tree = etree.HTML(r) + bows_values = tree.xpath('//td/text()') + print("bows_values:") + print(bows_values) + + #打包成组 + bows_values_packaged=[] + for i in range(int(len(bows_values)/5)):#组的序号 + package=[bows_values[i*5],bows_values[i*5+1],bows_values[i*5+2],bows_values[i*5+3],bows_values[i*5+4]] + bows_values_packaged.append(package) + print("bows_values_packaged:") + print(bows_values_packaged) + + #将json文件当中“position_name”的键名转化成bows_keys列表的值 + bows={} + bows_keys=list(json_process(2)) + for i in range(3): + bows_keys.pop() + i=0#打包的序号 + for bow_key in bows_keys: + bows[bow_key]=bows_values_packaged[i] + i+=1 + print('解析已完成') + + return bows + +#更改工作地址 +def change_position(): + retval = os.getcwd() + print(retval) + path="C:\\VPO\\oilData" + os.mkdir(path) + if retval!=path: + os.chdir(path) + print(os.getcwd()) + +#时间调用示例 +def time_test(): + # 获取当前日期和时间 + now = datetime.datetime.now() + # 格式化日期和时间 + formatted_date = now.strftime("%Y-%m-%d") + formatted_time = now.strftime("%H:%M:%S") + #格式化后的日期: 2023-06-16 + #格式化后的时间: 16:32:00 + +#爬取,调用json_process,返回内容为一个列表,第一个值是爬取内容(文本),第二个值是状态码 +def climb(): + web_url=json_process(0) + if web_url == "http://youjia.10260.com/chaiyou/": + response = requests.get(web_url) + code= response.status_code + print("爬取成功") + return [response.text,code] + elif web_url == "https://www.cngold.org/crude/chaiyou.html": + print("Error") + return(None) + elif web_url == "https://www.5waihui.com/oil/cn/": + print("Error") + return(None) + elif web_url == "https://www.05348.com/": + print("Error") + return(None) + elif web_url == "http://gas.humeup.cn/0haochaiyou/": + print("Error") + return(None) + else: + print("Error") + return(None) + + +#运行部分,是一个循环,每24小时一次 +while True: + time = datetime.datetime.now() + #change_position() + + #在下一行设置具体时间 + if time.strftime("%H:%M:%S")=="13:11:00" or True: + #创建html文本文件并保存 + file_name=f"{time.strftime('%Y-%m-%d')}_oil_price" + f=open(f"{file_name}.html","w+",encoding="utf-8") + f.write(climb()[0]) + + #html文本文件出现空行问题,重新读取一遍数据删除空行 + _f=open(f"_{file_name}.html","w+",encoding="utf-8") + f.seek(0) + rl=f.readlines() + for i in rl: + if i != "\n" : + _f.write(i) + + #输出时间 + print(time) - #爬取,调用json_process,返回内容为一个列表,第一个值是爬取内容(文本),第二个值是状态码 - def climb(): - web_url=oil.json_process(0) - if 
web_url == "http://youjia.10260.com/chaiyou/": - response = requests.get(web_url) - code= response.status_code - print("爬取成功") - return [response.text,code] - elif web_url == "https://www.cngold.org/crude/chaiyou.html": - print("Error") - return(None) - elif web_url == "https://www.5waihui.com/oil/cn/": - print("Error") - return(None) - elif web_url == "https://www.05348.com/": - print("Error") - return(None) - elif web_url == "http://gas.humeup.cn/0haochaiyou/": - print("Error") - return(None) - else: - print("Error") - return(None) - - - #运行部分,是一个循环,每24小时一次 - while True: - time = datetime.datetime.now() + f.close() + _f.close() + + data=analyze_with_xpath(f"_{file_name}.html") + print(data) + + #将数据写入json文件 + with open(f"{file_name}.json","w",encoding="utf-8") as _f: + json.dump(data,_f) + + sleep(60)#防止重复爬取 - #在下一行设置具体时间 - if time.strftime("%H:%M:%S")=="13:11:00": - - #创建html文本文件并保存 - file_name=f"{time.strftime('%Y-%m-%d')}_oil_price" - f=open(f"{file_name}.html","w+",encoding="utf-8") - f.write(climb()[0]) - - #html文本文件出现空行问题,重新读取一遍数据删除空行 - _f=open(f"_{file_name}.html","w+",encoding="utf-8") - f.seek(0) - rl=f.readlines() - for i in rl: - if i != "\n" : - _f.write(i) - - #输出时间 - print(time) - - f.close() - _f.close() - - data=analyze_with_xpath(f"_{file_name}.html") - print(data) - - #将数据写入json文件 - with open(f"{file_name}.json","w",encoding="utf-8") as _f: - json.dump(data,_f) - - sleep(60)#防止重复爬取 - -- Gitee From 6df6b1b9783c0f6c07ea8780fca6e1e0e36e7d6f Mon Sep 17 00:00:00 2001 From: qinxiao <1872826298@163.com> Date: Thu, 14 Dec 2023 23:21:51 +0800 Subject: [PATCH 4/4] 123 --- t1.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 t1.txt diff --git a/t1.txt b/t1.txt deleted file mode 100644 index 2420738..0000000 --- a/t1.txt +++ /dev/null @@ -1 +0,0 @@ -asdfghjk -- Gitee
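
Note on the series as a whole: after PATCH 2, OilPriceCrawler.request_and_process_data() is still the placeholder stub (is_request_success = False, text = "1234567"), while the working fetch logic lives in the standalone loop in oil_price/oil.py. A minimal sketch of folding the mirror-fallback fetch into the crawler method, following the VegetableCrawler pattern already in _crawler.py — the oil_web_list.json path, the 30-second timeout, and leaving HTML parsing to a later cleaning step are assumptions, not something the committed code settles:

    # Sketch of a possible OilPriceCrawler.request_and_process_data(); assumes
    # oil_price/oil_web_list.json is reachable from the working directory and that
    # Config.CrawlerConfig.NOW_TIME_FUNC is the shared timestamp function used by
    # the other crawlers in _crawler.py.
    import json
    import os

    import pandas as pd
    import requests

    from config import Config


    def request_and_process_data(self):
        # Load the ordered mirror list (first, second, ... fifth).
        web_list_path = os.path.join(os.getcwd(), "oil_price", "oil_web_list.json")
        with open(web_list_path, "r", encoding="utf-8") as pf:
            mirrors = list(json.load(pf)["oil_web_list"].values())

        # Try each mirror once instead of looping forever; returning None lets the
        # framework's _rerun() retry after try_again_seconds.
        text = None
        for url in mirrors:
            try:
                response = requests.get(url, timeout=30)
            except requests.RequestException:
                continue
            if response.status_code == 200:
                text = response.text
                break
        if text is None:
            return None

        # Same output contract as the other crawlers: one row, crawl time plus raw response.
        return pd.DataFrame(
            {"爬取时间": [Config.CrawlerConfig.NOW_TIME_FUNC()], "响应文本": [text]}
        )

Turning the raw HTML into the province-to-price dictionary (what analyze_with_xpath() does) would then belong in a dedicated data cleaner, matching how the other crawlers store only the raw response text. Separately, the OilPriceResponseFileSystemManager added in PATCH 1 registers EXTREME_WEATHER_RESPONSE_MERGE_TAG and reuses the ExtremeWeather RESPONSE_FILE_PATH, so oil-price responses would be written to the extreme-weather location; a version along these lines — the oil-price config entries named here are assumptions and would need to be added to the Config classes — keeps the two data streams separate:

    # Sketch for modules/_datamanager.py (ResponseFileSystemManager and Config are already
    # imported there); OIL_PRICE_RESPONSE_MERGE_TAG and OilPriceResponseFileSystemManagerConfig
    # are assumed config entries, not existing ones.
    class OilPriceResponseFileSystemManager(ResponseFileSystemManager):
        '''
        油价响应存到文件系统
        '''

        def __init__(self):
            super().__init__()
            self._callback_func_dict = {
                Config.DataPackageConfig.OIL_PRICE_RESPONSE_MERGE_TAG: self.merge_to_csv,
            }
            self.response_file_path = (
                Config.DataManagerConfig.FileSystemManagerConfig
                .ResponseFileSystemManagerConfig
                .OilPriceResponseFileSystemManagerConfig.RESPONSE_FILE_PATH
            )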