note---notes
/
5_15毕设

import csv
import requests
from lxml import etree
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import pandas as pd
import numpy as np
# Switch matplotlib to the non-interactive 'agg' backend (figures are only written to files)
matplotlib.use('agg')

# Browser-like User-Agent header sent with every HTTP request in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    ),
}
# Collect the ingredient names listed for each recipe on a parsed page.
def get_ingredients_list(e):
    """Return one list of ingredient names per recipe paragraph.

    Args:
        e: Parsed document exposing ``xpath`` (callers pass the result of
            ``etree.HTML``).

    Returns:
        A list of lists holding the ``<a>`` text nodes found under each
        ``<p class="ing ellipsis">`` element. Paragraphs that yield no
        text are skipped, so the result can be shorter than the number
        of paragraphs on the page.
    """
    per_paragraph = (
        paragraph.xpath(".//a/text()")
        for paragraph in e.xpath("//p[@class='ing ellipsis']")
    )
    return [names for names in per_paragraph if names]
# Scrape recipe data from xiachufang category listing pages.
def scrape_recipe_data(category_id, start_page, end_page, timeout=10):
    """Scrape recipe names, ingredient lists and score texts for a category.

    Args:
        category_id: Category identifier interpolated into the listing URL.
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive).
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A tuple ``(recipe_names, ingredients_list, scores)`` of three lists
        accumulated over all pages. ``scores`` holds the raw text nodes of
        the matched ``<span>`` elements (strings, not numbers).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            a response does not arrive within ``timeout`` seconds.
    """
    recipe_names = []
    ingredients_list = []
    scores = []

    for page in range(start_page, end_page + 1):
        print(f"正在爬取第 {page} 页...")
        url = f"https://www.xiachufang.com/category/{category_id}/?page={page}"
        res = requests.get(url, headers=headers, timeout=timeout)
        if not res.ok:
            # An error page carries no recipe markup; skip it explicitly
            # instead of silently parsing it.
            print(f"第 {page} 页请求失败，状态码 {res.status_code}，已跳过")
            continue
        e = etree.HTML(res.text)

        # NOTE(review): the three lists are filled by independent page-wide
        # queries, and get_ingredients_list drops recipes without
        # ingredients, so entries at the same index are not guaranteed to
        # belong to the same recipe - confirm against the page markup.
        recipe_names.extend(
            [name.strip() for name in e.xpath('//p[@class="name"]/a/text()') if name.strip()]
        )
        ingredients_list.extend(get_ingredients_list(e))
        scores.extend(e.xpath("//p[@class='stats']/span/text()"))
    return recipe_names, ingredients_list, scores
# Pie chart of how often the most common ingredients occur.
def draw_pie_chart(ingredient_counts, filename):
    """Save a pie chart of the ten most common ingredients.

    Args:
        ingredient_counts: Counter-like object; only ``most_common`` is used.
        filename: Output image path handed to ``plt.savefig``.
    """
    most_frequent = ingredient_counts.most_common(10)

    # Select the SimHei font for the Chinese labels and keep minus signs
    # as ASCII hyphens.
    matplotlib.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    names, occurrences = zip(*most_frequent)
    plt.pie(occurrences, labels=names, autopct='%1.1f%%', startangle=90)
    plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.title('食材出现次数分布')

    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
# Bar chart of how often each score value occurs.
def draw_bar_chart(scores, filename):
    """Save a bar chart of the ten most frequent score values.

    Args:
        scores: Iterable of hashable score values; they are counted as-is
            and used directly as bar labels.
        filename: Output image path handed to ``plt.savefig``.
    """
    most_frequent = Counter(scores).most_common(10)

    # Select the SimHei font for the Chinese labels and keep minus signs
    # as ASCII hyphens.
    matplotlib.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    score_labels, occurrences = zip(*most_frequent)
    plt.bar(score_labels, occurrences)
    plt.xlabel('评分')
    plt.ylabel('出现次数')
    plt.title('评分出现次数分布')

    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()

# Box plot of the score distribution.
def draw_box_plot(scores, filename):
    """Save a box plot of the score distribution.

    Args:
        scores: Iterable of values accepted by ``float()`` (numbers or
            numeric strings).
        filename: Output image path handed to ``plt.savefig``.

    Raises:
        ValueError: If a score cannot be converted to ``float``.
    """
    values = [float(score) for score in scores]

    # Select the SimHei font for the Chinese labels and keep minus signs
    # as ASCII hyphens.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    # Box outline, whiskers and caps share one line style. Each artist gets
    # its own dict copy so that none of them can be affected if the
    # plotting call modifies a dict it receives.
    outline_style = dict(linestyle='-', linewidth=1.5, color='darkgoldenrod')
    median_style = dict(linestyle='-', linewidth=1.5, color='firebrick')

    box_plot = plt.boxplot(
        values,
        boxprops=dict(outline_style),
        whiskerprops=dict(outline_style),
        medianprops=median_style,
        capprops=dict(outline_style),
        patch_artist=True,  # boxes become patches so they can be filled
    )

    # Fill colour of the boxes.
    for box in box_plot['boxes']:
        box.set(facecolor='lightgoldenrodyellow')

    plt.xlabel('评分')
    plt.ylabel('分布')
    plt.title('评分分布箱型图')
    plt.grid(axis='y', linestyle='--', linewidth=0.5, alpha=0.6)

    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
# Compute the ingredient co-occurrence matrix.
def calculate_cooccurrence_matrix(ingredients_list):
    """Count how often each pair of ingredients appears in the same recipe.

    The ingredient vocabulary is derived from ``ingredients_list`` itself,
    so the function does not depend on any module-level state.

    Args:
        ingredients_list: Iterable of recipes, each a sequence of
            ingredient names.

    Returns:
        A square, symmetric ``pandas.DataFrame`` of ``int32`` counts whose
        index and columns are the sorted unique ingredient names. Every
        pair of positions within a recipe adds 1 to both ``[a, b]`` and
        ``[b, a]``. An empty input yields an empty 0x0 frame.
    """
    unique_ingredients = sorted(
        {ingredient for ingredients in ingredients_list for ingredient in ingredients}
    )
    ingredient_indices = {ingredient: i for i, ingredient in enumerate(unique_ingredients)}

    size = len(unique_ingredients)
    cooccurrence_matrix = np.zeros((size, size), dtype=np.int32)

    for ingredients in ingredients_list:
        for position, first in enumerate(ingredients):
            row = ingredient_indices[first]
            # Pair each ingredient only with the ones after it so every
            # unordered pair of positions is counted exactly once.
            for second in ingredients[position + 1:]:
                col = ingredient_indices[second]
                cooccurrence_matrix[row, col] += 1
                cooccurrence_matrix[col, row] += 1

    return pd.DataFrame(cooccurrence_matrix, index=unique_ingredients, columns=unique_ingredients)
# Box plot of recipe-name lengths.
def draw_recipe_name_length_box_plot(recipe_names, filename):
    """Save a box plot of the character lengths of the recipe names.

    Args:
        recipe_names: Iterable of recipe name strings.
        filename: Output image path handed to ``plt.savefig``.
    """
    name_lengths = [len(name) for name in recipe_names]

    # Configure the font here as the other plotting functions do, so the
    # Chinese labels do not depend on another function having set the
    # global rcParams first.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    plt.boxplot(name_lengths)
    plt.xlabel('食谱名字长度')
    plt.ylabel('分布')
    plt.title('食谱名字长度分布箱型图')
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()

# Horizontal bar chart of ingredient frequencies.
def draw_horizontal_bar_chart(ingredient_counts, filename):
    """Save a horizontal bar chart of the ten most common ingredients.

    Args:
        ingredient_counts: Counter-like object; only ``most_common`` is used.
        filename: Output image path handed to ``plt.savefig``.
    """
    most_frequent = ingredient_counts.most_common(10)

    # Select the SimHei font for the Chinese labels and keep minus signs
    # as ASCII hyphens.
    matplotlib.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    names, occurrences = zip(*most_frequent)
    positions = np.arange(len(names))

    plt.barh(positions, occurrences, color='steelblue', alpha=0.7)
    plt.yticks(positions, names)

    plt.xlabel('出现次数')
    plt.ylabel('食材')
    plt.title('食材出现次数横向柱状图')

    # Flip the y axis so the most frequent ingredient ends up at the top.
    plt.gca().invert_yaxis()

    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
def save_data_to_csv(recipe_names, ingredients_list, scores, filename):
    """Write the scraped recipes to a UTF-8 CSV file.

    The file starts with the header ``Recipe Name, Ingredients, Score``.
    Each following row joins a recipe's ingredients with ``', '``. The
    three inputs are combined with ``zip``, so rows stop at the shortest.

    Args:
        recipe_names: Iterable of recipe names.
        ingredients_list: Iterable of ingredient-name sequences.
        scores: Iterable of score values written as-is.
        filename: Path of the CSV file to create or overwrite.
    """
    rows = (
        [recipe_name, ', '.join(recipe_ingredients), recipe_score]
        for recipe_name, recipe_ingredients, recipe_score
        in zip(recipe_names, ingredients_list, scores)
    )
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Recipe Name', 'Ingredients', 'Score'])
        writer.writerows(rows)
# Line chart of ratings against ingredients.
def plot_ratings_ingredients_curve(ingredients, ratings, save_path):
    """Save a line chart of ``ratings`` plotted against ``ingredients``.

    This single definition replaces two consecutive definitions of the
    same name. The earlier one was shadowed and never ran; it also called
    ``plt.show()`` and left the figure open.

    Args:
        ingredients: Sequence of x values (ingredient names).
        ratings: Sequence of y values, same length as ``ingredients``.
        save_path: Output image path handed to ``plt.savefig``.
    """
    # Configure the font here as the other plotting functions do, so the
    # Chinese labels do not depend on another function having set the
    # global rcParams first.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    plt.plot(ingredients, ratings)

    # Title and axis labels.
    plt.title('评分与食材曲线图')
    plt.xlabel('食材')
    plt.ylabel('评分')

    # Rotate the tick labels so longer names do not overlap; add a grid.
    plt.xticks(rotation=45)
    plt.grid(True)

    # Write the figure to disk and release it.
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
# Scrape recipe data from xiangha.com.
def scrape_xiangha_data(start_page, end_page, timeout=10):
    """Scrape recipe names, ingredient texts and "scores" from xiangha.

    Args:
        start_page: First listing page to fetch (inclusive).
        end_page: Last listing page to fetch (inclusive).
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A tuple ``(recipe_names, ingredients_list, scores)`` of three lists
        accumulated over all pages. ``scores`` contains floats parsed from
        texts ending in "收藏".

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            a response does not arrive within ``timeout`` seconds.
        ValueError: If the text before "收藏" is not a valid float.
    """
    recipe_names = []
    ingredients_list = []
    scores = []

    for page in range(start_page, end_page + 1):
        url = f'https://www.xiangha.com/caipu/c-jiachang/hot-{page}/'
        res = requests.get(url, headers=headers, timeout=timeout)
        if not res.ok:
            # An error page carries no recipe markup; skip it explicitly
            # instead of silently parsing it.
            print(f"香哈网第 {page} 页请求失败，状态码 {res.status_code}，已跳过")
            continue
        e = etree.HTML(res.text)

        recipe_names.extend(
            [name.strip() for name in e.xpath("//p[@class='name kw']/a/text()") if name.strip()]
        )

        # One entry per <div class="ins"> that contains non-blank <p> text.
        for p in e.xpath("//div[@class='ins']"):
            ingredients = p.xpath(".//p/text()")
            cleaned_ingredients = [item.strip() for item in ingredients if item.strip()]
            if cleaned_ingredients:
                ingredients_list.append(cleaned_ingredients)

        # NOTE(review): the value is the number before "收藏", so it looks
        # like a favourites count rather than a rating - confirm before
        # comparing it with scores from other sources.
        raw_scores = [item.strip() for item in e.xpath("//div[@class='ins']/p[3]/text()") if item.strip()]
        cleaned_scores = [float(item.split("收藏")[0]) for item in raw_scores if item.endswith("收藏")]
        scores.extend(cleaned_scores)

    return recipe_names, ingredients_list, scores
# Average score per ingredient.
def calculate_average_scores(ingredients_list, scores):
    """Return the mean score of the recipes each ingredient appears in.

    Args:
        ingredients_list: Iterable of recipes, each an iterable of
            ingredient names.
        scores: Sequence of values accepted by ``float()``. The i-th score
            belongs to the i-th recipe; pairing stops at the shorter input.

    Returns:
        A dict mapping ingredient name to the ``np.mean`` of the scores
        collected for it, in order of first appearance.
    """
    # Convert everything up front so an invalid score fails immediately.
    numeric_scores = list(map(float, scores))

    scores_by_ingredient = {}
    for recipe_ingredients, recipe_score in zip(ingredients_list, numeric_scores):
        for name in recipe_ingredients:
            scores_by_ingredient.setdefault(name, []).append(recipe_score)

    averages = {}
    for name, collected in scores_by_ingredient.items():
        averages[name] = np.mean(collected)
    return averages

def plot_ingredient_score_scatter(ingredient_counts, ingredient_avg_score_map, bin_num=30,
                                  filename='sandian.png'):
    """Save a scatter plot of binned ingredient counts vs. mean average score.

    Ingredients present in both mappings are grouped into ``bin_num - 1``
    equal-width bins by occurrence count. For every populated bin the mean
    of the ingredients' average scores is plotted at the bin's left edge.
    Empty bins are omitted rather than drawn as 0.

    Args:
        ingredient_counts: Mapping of ingredient name to occurrence count.
        ingredient_avg_score_map: Mapping of ingredient name to average
            score; non-finite scores are ignored.
        bin_num: Number of bin edges (so ``bin_num - 1`` bins); at least 2.
        filename: Output image path; defaults to the previously
            hard-coded ``'sandian.png'``.

    Raises:
        ValueError: If ``bin_num`` is below 2 or no ingredient has both a
            count and a finite average score.
    """
    if bin_num < 2:
        raise ValueError('bin_num must be at least 2')

    # Build (count, score) pairs per ingredient so both columns stay
    # aligned and refer to the same ingredients.
    pairs = [
        (ingredient_counts[name], avg_score)
        for name, avg_score in ingredient_avg_score_map.items()
        if name in ingredient_counts and np.isfinite(avg_score)
    ]
    if not pairs:
        raise ValueError('no ingredient has both a count and a finite average score')

    counts = np.asarray([count for count, _ in pairs], dtype=float)
    avg_scores = np.asarray([score for _, score in pairs], dtype=float)

    # Two histograms over identical bins: per-bin score sums and per-bin
    # sizes. The last bin is closed on the right, so the most frequent
    # ingredient is included.
    score_sums, edges = np.histogram(counts, bins=bin_num - 1, weights=avg_scores)
    bin_sizes, _ = np.histogram(counts, bins=bin_num - 1)

    populated = bin_sizes > 0
    bin_means = score_sums[populated] / bin_sizes[populated]

    plt.scatter(edges[:-1][populated], bin_means, alpha=0.5)  # alpha keeps overlapping points visible

    plt.xlabel('Ingredient Counts')
    plt.ylabel('Average Scores')
    plt.title('Scatter Plot of Ingredient Counts vs Average Scores')

    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
# Scrape two xiachufang categories: recipe names, ingredient lists and scores.
recipe_names_1, ingredients_list_1, scores_1 = scrape_recipe_data(40077, 1, 20)
recipe_names_2, ingredients_list_2, scores_2 = scrape_recipe_data(40076, 1, 20)

# Merge the results of both categories once and reuse them below.
combined_recipe_names = recipe_names_1 + recipe_names_2
combined_ingredients_list = ingredients_list_1 + ingredients_list_2
combined_scores = scores_1 + scores_2

# Count how often each ingredient occurs.
ingredient_counts = Counter()
for recipe_ingredients in combined_ingredients_list:
    ingredient_counts.update(recipe_ingredients)

# Pie chart of ingredient frequencies.
draw_pie_chart(ingredient_counts, 'binzhuang.png')

# Bar chart of score frequencies.
draw_bar_chart(combined_scores, 'score.png')

# Box plot of the score distribution.
draw_box_plot(combined_scores, 'score_diffrent.png')

# Box plot of recipe-name lengths.
draw_recipe_name_length_box_plot(combined_recipe_names, 'recipe_name_length_box_plot.png')

# Horizontal bar chart of ingredient frequencies.
draw_horizontal_bar_chart(ingredient_counts, 'ingredient_count_horizontal_bar_chart.png')

# Save the xiachufang data to a CSV file.
save_data_to_csv(combined_recipe_names, combined_ingredients_list, combined_scores, 'meishi.csv')

# Names of the ten most common ingredients.
top_10_ingredients = [name for name, _ in ingredient_counts.most_common(10)]

# Placeholder ratings (1..N); replace with real rating data.
ratings = list(range(1, len(top_10_ingredients) + 1))

# Line chart of the placeholder ratings per top ingredient.
plot_ratings_ingredients_curve(top_10_ingredients, ratings, 'quxian.png')


# Scrape additional data from xiangha.
xiangha_recipe_names, xiangha_ingredients_list, xiangha_scores = scrape_xiangha_data(1, 40)

# Append the xiangha data to the combined lists.
# NOTE(review): xiangha "scores" are parsed from "…收藏" texts, so they look
# like favourites counts - confirm that mixing them with the xiachufang
# scores is intended.
combined_recipe_names += xiangha_recipe_names
combined_ingredients_list += xiangha_ingredients_list
combined_scores += xiangha_scores

# Add the xiangha ingredients to the counts.
for recipe_ingredients in xiangha_ingredients_list:
    ingredient_counts.update(recipe_ingredients)

# The merged data is not written back to CSV here; only the averages and the
# scatter plot are produced from it.
# Average score per ingredient over the merged data.
ingredient_avg_score_map = calculate_average_scores(combined_ingredients_list, combined_scores)

# Scatter plot of ingredient counts vs. average scores.
plot_ingredient_score_scatter(ingredient_counts, ingredient_avg_score_map)