# Source: Gitee repository 张奕驰/note(笔记), file "5_15毕设" (11.69 KB).
# Committed by 张奕驰 on 2023-05-15 09:04 with message "add 5_15毕设.".
# The repository declares no open-source license (LICENSE) file; check the
# project description and its upstream dependencies before reusing this code.
import csv
import requests
from lxml import etree
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import pandas as pd
import numpy as np
# Configure matplotlib to use the 'agg' backend; every figure in this file is
# written to disk with plt.savefig.
matplotlib.use('agg')
# Browser-like User-Agent header passed to each requests.get call in this file.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
# Collect the ingredient names listed for each recipe entry
def get_ingredients_list(e):
    """Return one list of ingredient names per recipe entry found via *e*.

    Every ``<p class="ing ellipsis">`` element is queried for the text of its
    descendant ``<a>`` elements. Entries that yield no text are left out, so
    the result can be shorter than the number of matched elements.
    """
    names_per_entry = (
        node.xpath(".//a/text()")
        for node in e.xpath("//p[@class='ing ellipsis']")
    )
    return [names for names in names_per_entry if names]
# Scrape recipe data from xiachufang.com category pages
def scrape_recipe_data(category_id, start_page, end_page, timeout=10):
    """Scrape recipe names, ingredient lists and scores for one category.

    Args:
        category_id: Category id inserted into the xiachufang.com URL.
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive).
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A ``(recipe_names, ingredients_list, scores)`` tuple of lists. Scores
        are the raw text nodes found on the page (strings).

    Raises:
        requests.RequestException: On connection errors or timeouts.

    NOTE(review): the three lists are collected independently per page and
    ``get_ingredients_list`` drops entries without ingredients, so the lists
    are not guaranteed to line up index by index -- verify before zipping.
    """
    recipe_names = []
    ingredients_list = []
    scores = []
    for page in range(start_page, end_page + 1):
        print(f"正在爬取第 {page} 页...")
        url = f"https://www.xiachufang.com/category/{category_id}/?page={page}"
        res = requests.get(url, headers=headers, timeout=timeout)
        if not res.ok:
            # An error page carries no recipe markup; report it and move on.
            print(f"第 {page} 页请求失败（HTTP {res.status_code}），已跳过")
            continue
        e = etree.HTML(res.text)
        if e is None:
            # Presumably lxml yields None for a body with nothing parsable;
            # guard so the xpath calls below cannot hit a NoneType.
            continue
        recipe_names.extend(
            name.strip()
            for name in e.xpath('//p[@class="name"]/a/text()')
            if name.strip()
        )
        ingredients_list.extend(get_ingredients_list(e))
        scores.extend(e.xpath("//p[@class='stats']/span/text()"))
    return recipe_names, ingredients_list, scores
# Draw a pie chart of ingredient occurrence counts
def draw_pie_chart(ingredient_counts, filename):
    """Save a pie chart of the ten most common ingredients to *filename*.

    Args:
        ingredient_counts: Object with a ``most_common`` method (the script
            passes a ``collections.Counter`` keyed by ingredient name).
        filename: Output path handed to ``plt.savefig``.

    When there are no counts the chart is skipped and no file is written.
    """
    top_ingredients = ingredient_counts.most_common(10)
    if not top_ingredients:
        # zip(*[]) would yield nothing to unpack into labels/sizes below.
        print('没有食材数据，跳过饼图绘制')
        return
    # Select SimHei so the Chinese labels and title can be rendered.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    labels, sizes = zip(*top_ingredients)
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    plt.axis('equal')
    plt.title('食材出现次数分布')
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    # Close the implicit figure so later charts start from a clean canvas.
    plt.close()
# Draw a bar chart of how often each score value occurs
def draw_bar_chart(scores, filename):
    """Save a bar chart of the ten most frequent score values to *filename*.

    Args:
        scores: Iterable of hashable score values; each distinct value
            becomes one bar label.
        filename: Output path handed to ``plt.savefig``.

    When *scores* is empty the chart is skipped and no file is written.
    """
    top_scores = Counter(scores).most_common(10)
    if not top_scores:
        # zip(*[]) would yield nothing to unpack into labels/heights below.
        print('没有评分数据，跳过柱状图绘制')
        return
    # Select SimHei so the Chinese labels and title can be rendered.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    labels, heights = zip(*top_scores)
    plt.bar(labels, heights)
    plt.xlabel('评分')
    plt.ylabel('出现次数')
    plt.title('评分出现次数分布')
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    # Close the implicit figure so later charts start from a clean canvas.
    plt.close()
# Draw a box plot of the score distribution
def draw_box_plot(scores, filename):
    """Save a box plot of *scores* to *filename*.

    Args:
        scores: Iterable of numbers or numeric strings; each item is
            converted with ``float``.
        filename: Output path handed to ``plt.savefig``.

    Raises:
        ValueError: If an item cannot be converted to ``float``.

    When *scores* is empty the chart is skipped and no file is written.
    """
    values = [float(score) for score in scores]
    if not values:
        print('没有评分数据，跳过箱型图绘制')
        return
    # Select SimHei so the Chinese labels and title can be rendered.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    # Box outline, whiskers and caps share one line style; the median differs.
    outline_style = dict(linestyle='-', linewidth=1.5, color='darkgoldenrod')
    median_style = dict(linestyle='-', linewidth=1.5, color='firebrick')
    box_plot = plt.boxplot(
        values,
        boxprops=dict(outline_style),
        whiskerprops=dict(outline_style),
        medianprops=median_style,
        capprops=dict(outline_style),
        patch_artist=True,
    )
    # Set the fill colour of each box.
    for box in box_plot['boxes']:
        box.set(facecolor='lightgoldenrodyellow')
    plt.xlabel('评分')
    plt.ylabel('分布')
    plt.title('评分分布箱型图')
    plt.grid(axis='y', linestyle='--', linewidth=0.5, alpha=0.6)
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    # Close the implicit figure so later charts start from a clean canvas.
    plt.close()
# Compute the ingredient co-occurrence matrix
def calculate_cooccurrence_matrix(ingredients_list):
    """Count how often each pair of ingredients appears in the same recipe.

    Args:
        ingredients_list: Iterable of per-recipe ingredient-name lists.

    Returns:
        A symmetric ``pandas.DataFrame`` of ``int32`` counts whose index and
        columns are the sorted distinct ingredient names found in
        *ingredients_list*. An empty input gives an empty 0x0 frame.
    """
    # Derive the vocabulary from the argument itself, not from module-level
    # state, so every ingredient that is looked up below has an index.
    unique_ingredients = sorted(
        {name for ingredients in ingredients_list for name in ingredients}
    )
    ingredient_indices = {
        ingredient: i for i, ingredient in enumerate(unique_ingredients)
    }
    size = len(unique_ingredients)
    cooccurrence_matrix = np.zeros((size, size), dtype=np.int32)
    for ingredients in ingredients_list:
        positions = [ingredient_indices[name] for name in ingredients]
        # Visit each unordered pair once and mirror it to keep symmetry.
        for offset, first in enumerate(positions):
            for second in positions[offset + 1:]:
                cooccurrence_matrix[first, second] += 1
                cooccurrence_matrix[second, first] += 1
    return pd.DataFrame(
        cooccurrence_matrix,
        index=unique_ingredients,
        columns=unique_ingredients,
    )
# Draw a box plot of recipe-name lengths
def draw_recipe_name_length_box_plot(recipe_names, filename):
    """Save a box plot of the character length of each recipe name.

    Args:
        recipe_names: Iterable of recipe-name strings.
        filename: Output path handed to ``plt.savefig``.

    When *recipe_names* is empty the chart is skipped and no file is written.
    """
    name_lengths = [len(name) for name in recipe_names]
    if not name_lengths:
        print('没有食谱名称数据，跳过箱型图绘制')
        return
    # Select SimHei here as the other chart functions do, so the Chinese
    # labels render even when this function is the first one called.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.boxplot(name_lengths)
    plt.xlabel('食谱名字长度')
    plt.ylabel('分布')
    plt.title('食谱名字长度分布箱型图')
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    # Close the implicit figure so later charts start from a clean canvas.
    plt.close()
# Draw a horizontal bar chart of ingredient occurrence counts
def draw_horizontal_bar_chart(ingredient_counts, filename):
    """Save a horizontal bar chart of the ten most common ingredients.

    Args:
        ingredient_counts: Object with a ``most_common`` method (the script
            passes a ``collections.Counter`` keyed by ingredient name).
        filename: Output path handed to ``plt.savefig``.

    When there are no counts the chart is skipped and no file is written.
    """
    top_ingredients = ingredient_counts.most_common(10)
    if not top_ingredients:
        # zip(*[]) would yield nothing to unpack into labels/heights below.
        print('没有食材数据，跳过横向柱状图绘制')
        return
    # Select SimHei so the Chinese labels and title can be rendered.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    labels, heights = zip(*top_ingredients)
    indices = np.arange(len(labels))
    plt.barh(indices, heights, color='steelblue', alpha=0.7)
    plt.yticks(indices, labels)
    plt.xlabel('出现次数')
    plt.ylabel('食材')
    plt.title('食材出现次数横向柱状图')
    # Flip the axis so the most frequent ingredient is drawn at the top.
    plt.gca().invert_yaxis()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    # Close the implicit figure so later charts start from a clean canvas.
    plt.close()
def save_data_to_csv(recipe_names, ingredients_list, scores, filename):
    """Write the scraped recipes to *filename* as a UTF-8 CSV file.

    The file starts with the header ``Recipe Name, Ingredients, Score``. Each
    following row holds one recipe, with its ingredients joined by ``', '``.
    The three inputs are zipped, so rows stop at the shortest of them.
    """
    rows = (
        [name, ', '.join(ingredients), score]
        for name, ingredients, score in zip(recipe_names, ingredients_list, scores)
    )
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Recipe Name', 'Ingredients', 'Score'])
        writer.writerows(rows)
# Draw a line chart of ratings against ingredients
def plot_ratings_ingredients_curve(ingredients, ratings, save_path):
    """Save a line chart of *ratings* plotted against *ingredients*.

    Args:
        ingredients: Sequence of x-axis values (ingredient names).
        ratings: Sequence of y-axis values, one per ingredient.
        save_path: Output path handed to ``plt.savefig``.

    This is the single definition of the function: an earlier duplicate that
    called ``plt.show()`` and never closed the figure was shadowed by this
    one and has been removed.
    """
    # Select SimHei here as the other chart functions do, so the Chinese
    # labels render even when this function is the first one called.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.plot(ingredients, ratings)
    # Title and axis labels.
    plt.title('评分与食材曲线图')
    plt.xlabel('食材')
    plt.ylabel('评分')
    # Rotate the tick labels so longer names do not overlap.
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    # Close the implicit figure so later charts start from a clean canvas.
    plt.close()
# Scrape recipe data from xiangha.com
def scrape_xiangha_data(start_page, end_page, timeout=10):
    """Scrape recipe names, ingredient texts and scores from xiangha.com.

    Args:
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive).
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A ``(recipe_names, ingredients_list, scores)`` tuple of lists. Each
        score is the number that precedes the "收藏" suffix in the third
        ``<p>`` of an ``ins`` block, converted to ``float``.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        ValueError: If the text before "收藏" is not a plain number.

    NOTE(review): the three lists are collected independently per page, so
    they are not guaranteed to line up index by index -- verify before
    zipping them together.
    """
    recipe_names = []
    ingredients_list = []
    scores = []
    for page in range(start_page, end_page + 1):
        url = f'https://www.xiangha.com/caipu/c-jiachang/hot-{page}/'
        res = requests.get(url, headers=headers, timeout=timeout)
        if not res.ok:
            # An error page carries no recipe markup; report it and move on.
            print(f"第 {page} 页请求失败（HTTP {res.status_code}），已跳过")
            continue
        e = etree.HTML(res.text)
        if e is None:
            # Presumably lxml yields None for a body with nothing parsable;
            # guard so the xpath calls below cannot hit a NoneType.
            continue
        recipe_names.extend(
            name.strip()
            for name in e.xpath("//p[@class='name kw']/a/text()")
            if name.strip()
        )
        for p in e.xpath("//div[@class='ins']"):
            cleaned_ingredients = [
                item.strip() for item in p.xpath(".//p/text()") if item.strip()
            ]
            if cleaned_ingredients:
                ingredients_list.append(cleaned_ingredients)
        raw_scores = [
            item.strip()
            for item in e.xpath("//div[@class='ins']/p[3]/text()")
            if item.strip()
        ]
        # Keep only entries ending in "收藏" and parse the number before it.
        scores.extend(
            float(item.split("收藏")[0])
            for item in raw_scores
            if item.endswith("收藏")
        )
    return recipe_names, ingredients_list, scores
# Compute the average score of the recipes each ingredient appears in
def calculate_average_scores(ingredients_list, scores):
    """Map every ingredient to the mean score of the recipes containing it.

    All of *scores* is converted with ``float`` first, then paired with
    *ingredients_list* position by position (pairing stops at the shorter of
    the two). An ingredient repeated inside one recipe contributes that
    recipe's score once per repetition.
    """
    numeric_scores = list(map(float, scores))
    scores_by_ingredient = {}
    for recipe_ingredients, value in zip(ingredients_list, numeric_scores):
        for name in recipe_ingredients:
            scores_by_ingredient.setdefault(name, []).append(value)
    return {
        name: np.mean(values) for name, values in scores_by_ingredient.items()
    }
def _bin_average_scores(counts, avg_scores, bin_num):
    """Group *avg_scores* into equal-width bins of *counts* and average them.

    Returns ``(left_edges, means)`` for the non-empty bins only.
    """
    # At least two edges are needed to form a single bin.
    edges = np.linspace(counts.min(), counts.max(), max(int(bin_num), 2))
    last_bin = len(edges) - 1
    # np.digitize puts the maximum count one past the final bin; clip it back
    # so the most frequent ingredient is not silently dropped.
    slots = np.clip(np.digitize(counts, edges), 1, last_bin)
    left_edges = []
    means = []
    for slot in range(1, last_bin + 1):
        members = slots == slot
        if members.any():
            left_edges.append(edges[slot - 1])
            means.append(avg_scores[members].mean())
    return left_edges, means


def plot_ingredient_score_scatter(ingredient_counts, ingredient_avg_score_map,
                                  bin_num=30, filename='sandian.png'):
    """Save a scatter plot of binned ingredient counts against average score.

    Args:
        ingredient_counts: Mapping of ingredient name to occurrence count.
        ingredient_avg_score_map: Mapping of ingredient name to average score.
        bin_num: Number of equally spaced bin edges spanning the count range.
        filename: Output path handed to ``plt.savefig``.

    Only ingredients present in both mappings with a finite average score are
    used, and counts and scores are taken for the same ingredient in the same
    order. Bins containing no ingredient are not plotted. When no ingredient
    qualifies the chart is skipped and no file is written.
    """
    shared = [
        name for name in ingredient_counts
        if name in ingredient_avg_score_map
        and np.isfinite(ingredient_avg_score_map[name])
    ]
    if not shared:
        print('没有可用的食材评分数据，跳过散点图绘制')
        return
    # Arrays (not lists) so the boolean masks in the helper can index them.
    counts = np.array([ingredient_counts[name] for name in shared], dtype=float)
    avg_scores = np.array(
        [ingredient_avg_score_map[name] for name in shared], dtype=float
    )
    bin_starts, bin_means = _bin_average_scores(counts, avg_scores, bin_num)
    plt.scatter(bin_starts, bin_means, alpha=0.5)
    plt.xlabel('Ingredient Counts')
    plt.ylabel('Average Scores')
    plt.title('Scatter Plot of Ingredient Counts vs Average Scores')
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    # Close the implicit figure so later charts start from a clean canvas.
    plt.close()
# Scrape two xiachufang categories: recipe names, ingredient lists and scores.
recipe_names_1, ingredients_list_1, scores_1 = scrape_recipe_data(40077, 1, 20)
recipe_names_2, ingredients_list_2, scores_2 = scrape_recipe_data(40076, 1, 20)

# Merge the results of both categories once; every step below reuses them.
combined_recipe_names = recipe_names_1 + recipe_names_2
combined_ingredients_list = ingredients_list_1 + ingredients_list_2
combined_scores = scores_1 + scores_2

# Count how often each ingredient occurs.
ingredient_counts = Counter()
for ingredients in combined_ingredients_list:
    ingredient_counts.update(ingredients)

# Charts built from the xiachufang data only.
draw_pie_chart(ingredient_counts, 'binzhuang.png')
draw_bar_chart(combined_scores, 'score.png')
draw_box_plot(combined_scores, 'score_diffrent.png')
# Box plot (not a histogram) of recipe-name lengths.
draw_recipe_name_length_box_plot(combined_recipe_names, 'recipe_name_length_box_plot.png')
draw_horizontal_bar_chart(ingredient_counts, 'ingredient_count_horizontal_bar_chart.png')

# Save the xiachufang data to a CSV file.
save_data_to_csv(combined_recipe_names, combined_ingredients_list, combined_scores, 'meishi.csv')

# Line chart for the ten most common ingredients. The ratings are
# placeholder values 1..N, not scraped scores.
top_10_ingredients = [ingredient for ingredient, _ in ingredient_counts.most_common(10)]
ratings = list(range(1, len(top_10_ingredients) + 1))
plot_ratings_ingredients_curve(top_10_ingredients, ratings, 'quxian.png')

# Scrape xiangha.com and append its data to the combined lists.
xiangha_recipe_names, xiangha_ingredients_list, xiangha_scores = scrape_xiangha_data(1, 40)
combined_recipe_names.extend(xiangha_recipe_names)
combined_ingredients_list.extend(xiangha_ingredients_list)
# NOTE(review): the xiangha values are the numbers preceding "收藏", while the
# xiachufang values are raw page text -- confirm both are on the same scale.
combined_scores.extend(xiangha_scores)

# Add the xiangha ingredients to the running counts.
for ingredients in xiangha_ingredients_list:
    ingredient_counts.update(ingredients)

# Average score per ingredient over the merged data. The merged data itself
# is not written back to a CSV file here.
ingredient_avg_score_map = calculate_average_scores(combined_ingredients_list, combined_scores)
# Scatter plot of ingredient counts against average scores.
plot_ingredient_score_scatter(ingredient_counts, ingredient_avg_score_map)
# Repository: https://gitee.com/cx3196463021/note---notes.git (branch: master)
# SSH: git@gitee.com:cx3196463021/note---notes.git
# Owner: cx3196463021; project: note---notes / note(笔记)