# NOTE(review): hosting-platform page artifact removed here ("code pull
# complete; the page will refresh automatically") — it was not Python source.
import pandas as pd
import os
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import geopandas as gpd
import shapely.geometry
from pathlib import Path
def star_wars_data():
    """Load Star Wars character data with explicit dtypes and save it.

    Reads ``data/characters.csv``, drops two unused columns, and writes
    the result to ``data/starwars.csv``.

    NOTE(review): the original docstring claimed the data was saved "in
    pickle format", but the code writes CSV — the docstring was wrong,
    not the code path (the output filename is CSV too).
    """
    df = pd.read_csv(
        os.path.join("data", "characters.csv"),
        thousands=",",  # parse e.g. "1,358" as 1358 in numeric columns
        dtype={
            "name": "string",
            "height": float,
            "mass": float,
            "hair_color": "category",
            "skin_color": "category",
            "eye_color": "category",
            "birth_year": "string",
            "gender": "category",
            "homeworld": "category",
            "species": "category",
        },
    )
    # These two columns are not used downstream.
    df = df.drop(["skin_color", "birth_year"], axis=1)
    df.info()  # print the schema as a quick sanity check
    # NOTE(review): the row index is written as an extra column; pass
    # index=False if downstream readers don't expect it — confirm first.
    df.to_csv(os.path.join("data", "starwars.csv"))
def tag_visible(element):
    """Return True when *element* is a text node a browser would render.

    Text inside non-rendered containers (scripts, styles, the document
    head, ...) and HTML comments is considered invisible.
    """
    hidden_parents = (
        "style",
        "script",
        "head",
        "title",
        "meta",
        "[document]",
    )
    if element.parent.name in hidden_parents:
        return False
    # HTML comments are text nodes too, but are never rendered.
    return not isinstance(element, Comment)
def text_from_html(body):
    """Extract the human-visible text from an HTML document.

    Parameters
    ----------
    body : str or bytes
        Raw HTML markup.

    Returns
    -------
    str
        All visible text nodes, stripped and joined by single spaces.
    """
    soup = BeautifulSoup(body, "html.parser")
    # ``find_all(string=True)`` replaces the deprecated
    # ``findAll(text=True)`` aliases; behaviour is identical.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)
def save_smith_book():
    """Download part of 'The Wealth of Nations' and save it as plain text.

    Fetches the Project Gutenberg HTML edition, keeps only the text
    between the transcriber-credits line and the end marker, and writes
    it to ``data/smith_won.txt``.
    """
    with urllib.request.urlopen(
        "https://www.gutenberg.org/files/3300/3300-h/3300-h.htm"
    ) as response:
        html = response.read()
    # Take the book text only: slice between the credits line and the
    # end-of-chapter marker.
    book_text = (
        text_from_html(html)
        .split("Produced by Colin Muir, and David Widger")[1]
        .split("Conclusion of the Chapter.")[0]
    )
    print(book_text.split("\n")[0])
    # Context manager with explicit encoding instead of a leaked handle
    # from bare ``open(...).write(...)``.
    with open(os.path.join("data", "smith_won.txt"), "w", encoding="utf-8") as f:
        f.write(book_text)
def prep_river_data():
    """
    Clip the Natural Earth 10m rivers/lake centerlines to a UK bounding box.

    Expects the shapefile (manually downloaded) in ``scratch/rivers/``:
    https://www.naturalearthdata.com/downloads/10m-physical-vectors/10m-rivers-lake-centerlines/
    Writes the clipped layer to ``data/geo/rivers/rivers.shp``.
    TODO: automate download of shapefile
    """
    source = os.path.join(
        "scratch", "rivers", "ne_10m_rivers_lake_centerlines.shp"
    )
    rivers = gpd.read_file(source)
    # Rough UK extent as (min lon, min lat, max lon, max lat).
    min_lon, min_lat, max_lon, max_lat = (
        -7.57216793459,
        49.959999905,
        1.68153079591,
        58.6350001085,
    )
    uk_polygon = shapely.geometry.box(
        min_lon, min_lat, max_lon, max_lat, ccw=True
    )
    # Keep only features fully inside the UK box.
    rivers = rivers[rivers.within(uk_polygon)]
    rivers.to_file(os.path.join("data", "geo", "rivers", "rivers.shp"))
def prep_covid_data():
    """
    Process UK government COVID-19 deaths data ready for plotting.

    Reads the pre-downloaded lower-tier local authority CSV, keeps the
    London boroughs (LAD codes containing "E09"), aggregates deaths to
    monthly totals per borough, and writes the result to
    ``data/geo/cv_ldn_deaths.parquet``.
    """
    # data_url = "https://api.coronavirus.data.gov.uk/v2/data?areaType=ltla&metric=newDeaths28DaysByDeathDate&format=csv&release=2021-02-27"
    cv_df = pd.read_csv(os.path.join("~", "Downloads", "ltla_2021-02-27.csv"))
    cv_df["date"] = pd.to_datetime(cv_df["date"])
    # Cast all three columns in one pass.
    cv_df = cv_df.astype(
        {
            "newDeaths28DaysByDeathDate": int,
            "areaCode": "string",
            "areaName": "string",
        }
    )
    # Rename to match the Local Authority District boundary columns.
    cv_df = cv_df.rename(columns={"areaCode": "LAD20CD", "areaName": "LAD20NM"})
    cv_df = cv_df[cv_df["LAD20CD"].str.contains("E09")]
    # Monthly totals per borough.
    monthly = (
        cv_df.set_index(["date"])
        .groupby([pd.Grouper(freq="M"), "LAD20CD", "LAD20NM"])
        .sum()
        .reset_index()
    )
    monthly.to_parquet(os.path.join("data", "geo", "cv_ldn_deaths.parquet"))
def prep_gapminder_data():
    """
    Tidy the gapminder-style life-expectancy/GDP dataset.

    Downloaded from Our World in Data:
    https://ourworldindata.org/grapher/life-expectancy-vs-gdp-per-capita
    Writes the cleaned table to ``data/owid_gapminder.csv``.
    """
    raw = pd.read_csv(
        os.path.join("~", "Downloads", "life-expectancy-vs-gdp-per-capita.csv")
    )
    raw = raw[raw["Year"] > 1957]
    required = [
        "Life expectancy",
        "GDP per capita",
        "Total population (Gapminder, HYDE & UN)",
    ]
    raw = raw.dropna(subset=required)
    # Build an Entity -> Continent lookup from the 2015 rows and broadcast
    # it to every year.
    continent_lookup = (
        raw.loc[raw["Year"] == 2015, ["Entity", "Continent"]]
        .set_index("Entity")
        .to_dict()["Continent"]
    )
    raw["Continent"] = raw["Entity"].map(continent_lookup)
    raw = raw.rename(
        columns={
            "Entity": "Country",
            "Total population (Gapminder, HYDE & UN)": "Population",
        }
    )
    raw = raw.drop(["Code", "145446-annotations"], axis=1)
    raw = raw[raw["Country"] != "World"]
    raw.to_csv(Path("data/owid_gapminder.csv"), index=False)
def prep_air_quality_data(
    input_path=Path("/Users/aet/Downloads/beijing-air-quality.csv"),
    output_path=Path("data/beijing_pm.csv"),
):
    """Smooth the Beijing air-quality series with a 7-day rolling mean.

    The hard-coded absolute user path is kept only as the default so
    existing callers still work; pass ``input_path`` explicitly on other
    machines.

    Parameters
    ----------
    input_path : path-like, default the original download location
        CSV from the Air Quality Historical Data Platform with a ``date``
        column in ``DD/MM/YYYY`` format.
    output_path : path-like, default ``data/beijing_pm.csv``
        Destination for the smoothed series.
    """
    # first download data from Air Quality Historical Data Platform
    df = pd.read_csv(Path(input_path))
    df["date"] = pd.to_datetime(df["date"], format="%d/%m/%Y")
    df = df.set_index("date").sort_index()
    # 7-day rolling mean smooths out daily noise (first 6 rows become NaN).
    df = df.rolling(7).mean()
    df.to_csv(Path(output_path))
if __name__ == "__main__":
    # Run the data-prep steps when executed as a script.
    # NOTE(review): prep_covid_data() and prep_air_quality_data() are defined
    # above but never invoked here — confirm whether that is intentional.
    prep_river_data()
    star_wars_data()
    save_smith_book()
    prep_gapminder_data()
# NOTE(review): trailing hosting-platform boilerplate removed here (a
# content-moderation notice and appeal instructions) — it was not part of
# the source code.