# astronauts-analysis.py
"""
This script analyzes a data set about astronauts and creates different
plots as a result.
"""


from datetime import date
from os import makedirs
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd


# Use matplotlib's "ggplot" style for every plot created by this script
plt.style.use("ggplot")
# Input file; read with pandas.read_json in perform_analysis
_ASTRONAUT_DATA = "data/astronauts.json"
# Directory that receives the generated plot files (see save)
_OUTPUT_PATH = "results"


##
# Data preparation functions
##
def prepare_data_set(df):
    """
    Turn the raw astronaut records into an analysis-ready data frame.

    The columns are renamed via ``rename_columns``, ``astronaut_id`` becomes
    the index, rows without a ``time_in_space`` value are dropped and the
    date/time columns get proper pandas dtypes. The rows are sorted by
    ``birthdate`` and the following derived columns are added:

    - ``time_in_space_D``: whole days spent in space (float64)
    - ``alive``: result of ``is_alive`` for ``date_of_death``
    - ``age``: result of ``calculate_age`` for ``birthdate``
    - ``died_with_age``: result of ``died_with_age`` for each row

    Returns the prepared data frame.
    """
    df = rename_columns(df)
    df = df.set_index("astronaut_id")

    # Set pandas dtypes for columns with date or time
    df = df.dropna(subset=["time_in_space"])
    df["time_in_space"] = df["time_in_space"].astype(int)
    # The integer values are interpreted as minutes (unit="m")
    df["time_in_space"] = pd.to_timedelta(df["time_in_space"], unit="m")
    df["birthdate"] = pd.to_datetime(df["birthdate"])
    df["date_of_death"] = pd.to_datetime(df["date_of_death"])
    df = df.sort_values("birthdate")

    # Calculate extra columns from the original data
    # NOTE: astype("timedelta64[D]") is rejected by pandas >= 2.0 (only the
    # s/ms/us/ns resolutions are supported). Taking the day component and
    # casting it to float yields the same whole-day values that older pandas
    # versions produced for non-negative durations.
    df["time_in_space_D"] = df["time_in_space"].dt.days.astype("float64")
    df["alive"] = df["date_of_death"].apply(is_alive)
    df["age"] = df["birthdate"].apply(calculate_age)
    df["died_with_age"] = df.apply(died_with_age, axis=1)
    return df


def rename_columns(df):
    """
    Return a copy of the data frame with programming-friendly column names.

    The original column names of the data set are awkward to use with
    pandas, so the affected columns get shorter names. The index labels
    are converted to strings at the same time.
    """
    friendly_names = {
        "astronaut": "astronaut_id",
        "astronautLabel": "name",
        "birthplaceLabel": "birthplace",
        "sex_or_genderLabel": "sex_or_gender",
    }
    return df.rename(index=str, columns=friendly_names)


def is_alive(date_of_death):
    """Return True when no date of death is recorded, otherwise False."""
    return bool(pd.isnull(date_of_death))


def calculate_age(born, today=None):
    """
    Return the age in completed years of someone born on ``born``.

    born: date-like object providing ``year``, ``month`` and ``day``.
    today: reference date used for the calculation. Defaults to the
        current local date, which keeps single-argument calls unchanged.
    """
    if today is None:
        today = date.today()
    # The tuple comparison is True (counts as 1) while the birthday in the
    # reference year still lies ahead, so that year is not counted yet.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending


def died_with_age(row):
    """
    Return the age in completed years at the time of death.

    row: mapping-like record with the keys "birthdate" and "date_of_death".
    Returns None when no date of death is recorded.
    """
    if pd.isnull(row["date_of_death"]):
        return None
    birth = row["birthdate"]
    death = row["date_of_death"]
    # True (counts as 1) when death occurred before the birthday of that year
    before_birthday = (death.month, death.day) < (birth.month, birth.day)
    return death.year - birth.year - before_birthday


##
# Plot functions
##
def create_time_of_x_in_space(df, filename, title):
    """
    Plot the accumulated time in space against the birthdate.

    The running totals are calculated in the row order of ``df``. The
    accumulated days are drawn over the "birthdate" column and the figure
    is stored under ``filename`` via ``save``.

    df: data frame with the columns "birthdate", "time_in_space" and
        "time_in_space_D".
    filename: name of the image file to create.
    title: title shown above the plot.
    """
    needed_columns = ["birthdate", "time_in_space", "time_in_space_D"]
    totals = df[needed_columns].copy()
    totals["accumulated_time_in_minutes"] = totals["time_in_space"].cumsum()
    totals["accumulated_time_in_days"] = totals["time_in_space_D"].cumsum()

    axes = totals.plot(x="birthdate", y="accumulated_time_in_days")
    axes.set_title(title)
    axes.set_xlabel("Years ")
    axes.set_ylabel("t in days")

    figure = axes.get_figure()
    save(figure, filename)


def create_age_histogram(age_df, died_df):
    """
    Create a stacked histogram of the astronauts' ages.

    Two categories are combined: the age at death (``died_df``, column
    "died_with_age") and the current age of living astronauts (``age_df``,
    column "age"). The histogram uses 70 bins over the range 31 to 100 and
    is stored as "combined_histogram.png" via ``save``.
    """
    figure, axes = plt.subplots()
    ages_per_category = [died_df["died_with_age"], age_df["age"]]
    axes.hist(ages_per_category, bins=70, range=(31, 100), stacked=True)
    axes.set_title("Dead vs. Alive astronauts")
    axes.set_xlabel("Age")
    axes.set_ylabel("Number of astronauts")
    save(figure, "combined_histogram.png")


def create_age_boxplot(age_df, died_df):
    """
    Create box plots of the astronauts' age distribution.

    The first box shows the age at death (``died_df``, column
    "died_with_age"), the second one the age of living astronauts
    (``age_df``, column "age"). The figure is stored as "boxplot.png"
    via ``save``.
    """
    figure, axes = plt.subplots()
    axes.boxplot([died_df["died_with_age"], age_df["age"]])
    axes.set_title("Age distribution; Dead vs. Alive astronauts")
    axes.set_xlabel("Category")
    axes.set_ylabel("Age")
    # The boxes sit at positions 1 and 2; label them by category
    axes.set_xticks([1, 2])
    axes.set_xticklabels(["Dead", "Alive"])
    save(figure, "boxplot.png")


def save(fig, filename):
    """Store the figure as ``filename`` below the resolved output directory."""
    output_directory = Path(_OUTPUT_PATH).resolve()
    target = output_directory / Path(filename)
    fig.savefig(target)


def perform_analysis():
    """
    Glue data preparation and plotting together.

    Creates the output directory if needed, reads and prepares the
    astronaut data set, and then writes these plots: accumulated time in
    space for male, female and all humans, a combined age histogram and
    an age box plot of dead vs. alive astronauts.
    """

    # Set up directory structure and preprocess data
    makedirs(_OUTPUT_PATH, exist_ok=True)
    df = pd.read_json(Path(_ASTRONAUT_DATA).resolve())
    df = prepare_data_set(df)

    # Male and female humans in space (same plot, different subset)
    time_columns = ["birthdate", "time_in_space", "time_in_space_D"]
    plots_by_sex = (
        (
            "male",
            "male_humans_in_space.png",
            "Total time male humans have spend in space",
        ),
        (
            "female",
            "female_humans_in_space.png",
            "Total time female humans have spend in space",
        ),
    )
    for sex, filename, title in plots_by_sex:
        df_by_sex = df.loc[df["sex_or_gender"] == sex, time_columns].copy()
        create_time_of_x_in_space(df_by_sex, filename, title)

    # Humans in space
    create_time_of_x_in_space(
        df, "humans_in_space.png", "Total time humans have spend in space"
    )

    # Dead and alive astronauts analysis
    died_df = df.loc[df["alive"] == 0, ["died_with_age"]].copy()
    age_df = df.loc[df["alive"] == 1, ["age"]].copy()

    # Combined histogram of dead and alive astronauts
    create_age_histogram(age_df, died_df)

    # Box plots of dead vs alive astronauts
    create_age_boxplot(age_df, died_df)


# Main entry point: run the analysis only when executed as a script
if __name__ == "__main__":
    perform_analysis()