Commit 44d1fc8b authored by Robert Behling's avatar Robert Behling
Browse files

cleaned code

parent f7fc09c5
from os import path, makedirs """
import pandas as pd module docstring: description of what the .py file (module) does. here comes the module docstring:
import matplotlib.pyplot as plt
This script analyses a data set about astronauts and creates plots as a result.
"""
#1. import standard imports
from datetime import date from datetime import date
from pathlib import Path
from os import makedirs
#2. import additional third party imports (not included in python)
import matplotlib.pyplot as plt
import pandas as pd
#3. import local files/libraries
#within each of these groups, sort the imports alphabetically!
plt.style.use("ggplot")
_ASTRONAUT_DATA = "data/astronauts.json"
_OUTPUT_PATH = "results"
def calculate_age(born: pd.Timestamp) -> int:
    """
    Calculates the age in completed years as of today's date.

    :param born: pandas.Timestamp holding the date of birth
    :return: years as integer
    """
    today = date.today()
    # One year is subtracted while this year's birthday has not been reached yet.
    birthday_still_ahead = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - int(birthday_still_ahead)
def is_alive(date_of_death) -> bool:  # the parameter type is not annotated, because it can be pd.Timestamp AND null (NaT/None)
    """
    Checks whether a person is alive, i.e. whether 'date_of_death' is missing.

    :param date_of_death: pandas.Timestamp or NaTType
    :return: True if 'date_of_death' is null, False otherwise
    """
    return bool(pd.isnull(date_of_death))
def died_with_age(row: pd.Series):  # the return type is not annotated, because it can be int AND None
    """
    Calculates the age at death from 'birthdate' and 'date_of_death'.

    :param row: pandas.Series with the entries 'birthdate' and 'date_of_death'
    :return: int (completed years) if the person died, None otherwise
    """
    died = row["date_of_death"]
    if pd.isnull(died):
        return None
    born = row["birthdate"]
    # One year is subtracted when death occurred before the birthday of that year.
    died_before_birthday = (died.month, died.day) < (born.month, born.day)
    return died.year - born.year - int(died_before_birthday)
plt.style.use("ggplot")
df = pd.read_json("data/astronauts.json") # Data preparation functions
df = df.rename(index=str, columns={"astronaut": "astronaut_id", "astronautLabel": "name","birthplaceLabel": "birthplace","sex_or_genderLabel": "sex_or_gender"}) ##
def prepare_data_set(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the raw data by:
    - dropping NaN's
    - setting data types
    - calculating some extra columns

    Args:
        data_frame: A pandas DataFrame.

    Returns:
        A pandas DataFrame with preprocessed data.
    """
    data_frame = rename_columns(data_frame)
    data_frame = data_frame.set_index("astronaut_id")

    # Set pandas dtypes for columns with date or time
    data_frame = data_frame.dropna(subset=["time_in_space"])
    data_frame["time_in_space"] = data_frame["time_in_space"].astype(int)
    # unit="m": the integer values are interpreted as minutes
    data_frame["time_in_space"] = pd.to_timedelta(data_frame["time_in_space"], unit="m")
    data_frame["birthdate"] = pd.to_datetime(data_frame["birthdate"])
    data_frame["date_of_death"] = pd.to_datetime(data_frame["date_of_death"])
    # Rebind instead of sorting in place, so the frame is never mutated through a shared reference
    data_frame = data_frame.sort_values("birthdate")

    # Calculate extra columns from the original data
    # Whole days as plain numbers; .dt.days is used because
    # astype("timedelta64[D]") is rejected by pandas >= 2.0
    data_frame["time_in_space_D"] = data_frame["time_in_space"].dt.days
    data_frame["alive"] = data_frame["date_of_death"].apply(is_alive)
    data_frame["age"] = data_frame["birthdate"].apply(calculate_age)
    data_frame["died_with_age"] = data_frame.apply(died_with_age, axis=1)
    return data_frame
reduced_df.plot(x="birthdate", y="accumulated_time_in_days")
def rename_columns(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    The original column naming in the data set is not useful
    for programming with pandas. So we rename it.

    The index labels are converted to strings as well (index=str).
    The passed DataFrame itself is left unchanged.

    :param data_frame: pandas.DataFrame with the original column names
    :return: a new pandas.DataFrame with the renamed columns
    """
    name_mapping = {
        "astronaut": "astronaut_id",
        "astronautLabel": "name",
        "birthplaceLabel": "birthplace",
        "sex_or_genderLabel": "sex_or_gender",
    }
    return data_frame.rename(index=str, columns=name_mapping)
fig = plt.gcf() ##
fig.savefig("humans_in_space.png") # Plot functions
##
def create_time_of_x_in_space(data_frame, filename, title):
    """
    This function generates a plot with the summed up time of 'living beings'
    in space over the years by their birthdays.

    :param data_frame: pandas.DataFrame with the columns 'birthdate', 'time_in_space' and 'time_in_space_D'
    :param filename: name of the image file handed to save()
    :param title: title of the plot
    """
    # Work on a copy so the caller's frame does not get the helper column
    reduced_data_frame = data_frame[["birthdate", "time_in_space", "time_in_space_D"]].copy()
    reduced_data_frame["accumulated_time_in_days"] = reduced_data_frame["time_in_space_D"].cumsum()
    axs = reduced_data_frame.plot(x="birthdate", y="accumulated_time_in_days")
    axs.set_title(title)
    axs.set_xlabel("Years ")
    axs.set_ylabel("t in days")
    save(axs.get_figure(), filename)
axs.set_title("Age distribution; Dead vs. Alive astronauts")
def create_age_histogram(age_data_frame, died_data_frame):
    """
    The function generates a combined (stacked) histogram of astronauts
    in the categories 'age at death' and 'age alive' and hands it
    to save() as 'combined_histogram.png'.

    :param age_data_frame: pandas.DataFrame with the column 'age'
    :param died_data_frame: pandas.DataFrame with the column 'died_with_age'
    """
    fig, axs = plt.subplots(1, 1)
    axs.hist(
        [died_data_frame["died_with_age"], age_data_frame["age"]],
        bins=70,
        range=(31, 100),
        stacked=True,
    )
    axs.set_xlabel("Age")
    axs.set_ylabel("Number of astronauts")
    axs.set_title("Dead vs. Alive astronauts")
    save(fig, "combined_histogram.png")
def create_age_boxplot(age_data_frame, died_data_frame):
    """
    Draws a boxplot comparing the age distribution of dead and alive
    astronauts and hands the figure to save() as 'boxplot.png'.
    """
    fig, axs = plt.subplots(1, 1)
    ages_by_category = [died_data_frame["died_with_age"], age_data_frame["age"]]
    axs.boxplot(ages_by_category)
    axs.set_title("Age distribution; Dead vs. Alive astronauts")
    axs.set_xlabel("Category")
    axs.set_ylabel("Age")
    # Tick labels follow the order of the series passed to boxplot(): dead first, alive second
    plt.setp(axs, xticks=[1, 2], xticklabels=["Dead", "Alive"])
    save(fig, "boxplot.png")
def save(fig: plt.Figure, filename: str):
    """
    Writes a matplotlib Figure into the output directory (_OUTPUT_PATH).
    Existing files with the same filename are overwritten.

    Args:
        fig: matplotlib.pyplot.Figure
        filename: str
    """
    output_directory = Path(_OUTPUT_PATH).resolve()
    fig.savefig(output_directory / Path(filename))
def perform_analysis():
    """ Runs the whole pipeline: loads and prepares the data, then creates all plots. """
    # The output directory has to exist before any figure is saved
    makedirs(_OUTPUT_PATH, exist_ok=True)
    source_file = Path(_ASTRONAUT_DATA).resolve()
    data_frame = prepare_data_set(pd.read_json(source_file))

    # Accumulated time in space per gender: male first, then female
    time_columns = ["birthdate", "time_in_space", "time_in_space_D"]
    per_gender_plots = (
        ("male", "male_humans_in_space.png", "Total time male humans have spend in space"),
        ("female", "female_humans_in_space.png", "Total time female humans have spend in space"),
    )
    for gender, filename, title in per_gender_plots:
        matches_gender = data_frame["sex_or_gender"] == gender
        gender_data_frame = data_frame.loc[matches_gender, time_columns].copy()
        create_time_of_x_in_space(gender_data_frame, filename, title)

    # Accumulated time in space of all humans
    create_time_of_x_in_space(
        data_frame, "humans_in_space.png", "Total time humans have spend in space"
    )

    # Split into dead and alive astronauts for the age analysis
    is_dead = data_frame["alive"] == 0
    is_living = data_frame["alive"] == 1
    died_data_frame = data_frame.loc[is_dead, ["died_with_age"]].copy()
    age_data_frame = data_frame.loc[is_living, ["age"]].copy()

    # Combined histogram, then box plots of dead vs alive astronauts
    create_age_histogram(age_data_frame, died_data_frame)
    create_age_boxplot(age_data_frame, died_data_frame)
# Main entry point: the analysis only runs when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    perform_analysis()
#def save(fig: plt.Figure, filename: str) -> None:
#"""
#Saves a matplotlib Figure to a file. It overwrites existing files.
#:param fig: matplotlib.pyplot.Figure
#:param filename: str
#"""
#fig.savefig(Path(_OUTPUT_PATH).resolve() / Path(filename)) # similar to os.path.join()
#df = pd.read_json(_ASTRONAUT_DATA)
#df = df.rename(index=str, columns={"astronaut": "astronaut_id", "astronautLabel": "name","birthplaceLabel": "birthplace","sex_or_genderLabel": "sex_or_gender"})
#df = df.set_index("astronaut_id")
#df = df.dropna(subset=["time_in_space"])
#df["time_in_space"] = df["time_in_space"].astype(int)
#df["time_in_space"] = pd.to_timedelta(df["time_in_space"], unit="m")
#df["time_in_space_D"] = df["time_in_space"].astype("timedelta64[D]")
#df["birthdate"] = pd.to_datetime(df["birthdate"])
#df["date_of_death"] = pd.to_datetime(df["date_of_death"])
#df.sort_values("birthdate", inplace=True)
#df["alive"] = df["date_of_death"].apply(is_alive)
#df["age"] = df["birthdate"].apply(calculate_age)
#df["died_with_age"] = df.apply(died_with_age, axis=1)
## Male humans in space
#df_male = df.loc[df["sex_or_gender"] == "male", ["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df = df_male[["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df["accumulated_time_in_minutes"] = reduced_df["time_in_space"].cumsum()
#reduced_df["accumulated_time_in_days"] = reduced_df["time_in_space_D"].cumsum()
#reduced_df.plot(x="birthdate", y="accumulated_time_in_days")
#plt.title("Total time male humans have spend in space")
#plt.xlabel("Years")
#plt.ylabel("t in days")
#fig = plt.gcf()
#save(fig, "male_humans_in_space.png")
#fig.savefig()
## Female humans in space
#df_female = df.loc[df["sex_or_gender"] == "female", ["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df = df_female[["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df["accumulated_time_in_minutes"] = reduced_df["time_in_space"].cumsum()
#reduced_df["accumulated_time_in_days"] = reduced_df["time_in_space_D"].cumsum()
#reduced_df.plot(x="birthdate", y="accumulated_time_in_days")
#plt.title("Total time female humans have spend in space")
#plt.xlabel("Years")
#plt.ylabel("t in days")
#fig = plt.gcf()
#fig.savefig("female_humans_in_space.png")
## Humans in space
#reduced_df = df[["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df["accumulated_time_in_minutes"] = reduced_df["time_in_space"].cumsum()
#reduced_df["accumulated_time_in_days"] = reduced_df["time_in_space_D"].cumsum()
#reduced_df.plot(x="birthdate", y="accumulated_time_in_days")
#plt.title("Total time humans have spend in space")
#plt.xlabel("Years")
#plt.ylabel("t in days")
#fig = plt.gcf()
#fig.savefig("humans_in_space.png")
#died_df = df.loc[df["alive"] == 0, ["died_with_age"]].copy()
#age_df = df.loc[df["alive"] == 1, ["age"]].copy()
## Combined Histogram of dead and alive astronauts
#fig, axs = plt.subplots(1, 1)
#axs.hist([died_df["died_with_age"], age_df["age"]], bins=70, range=(31, 100), stacked=True)
#axs.set_xlabel("Age")
#axs.set_ylabel("Number of astronauts")
#axs.set_title("Dead vs. Alive astronauts")
#fig.savefig("combined_histogram.png")
## Box plots of dead vs alive astronauts
#fig, axs = plt.subplots(1, 1)
#axs.boxplot([died_df["died_with_age"], age_df["age"]])
#axs.set_title("Age distribution; Dead vs. Alive astronauts")
#axs.set_xlabel("Category")
#plt.setp(axs, xticks=[1, 2], xticklabels=["Dead", "Alive"])
#axs.set_ylabel("Age")
#fig.savefig("boxplot.png")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment