Commit b4a4a8e3 authored by Robert Behling's avatar Robert Behling
Browse files

cleaning up of the file content

parent 00fcee67
...@@ -5,14 +5,23 @@ stages: ...@@ -5,14 +5,23 @@ stages:
before_script: before_script:
- pip install pipenv - pip install pipenv
- pipenv install -d - pipenv install --dev
Python:pylint: test:pylint:
stage: test stage: test
script: script:
- pipenv run pylint --rcfile test/linting/pylintrc src/*.py - pipenv run pylint --rcfile test/linting/pylintrc src/*.py
only:
changes:
- "**/*.py"
- "test/linting/pylintrc"
- ".gitlab-ci.yml"
Python:unittests: test:unittest:
stage: test stage: test
script: script:
- pipenv run python -m unittest discover test/unittests - pipenv run python -m unittest discover test/unittest
only:
changes:
- "**/*.py"
- ".gitlab-ci.yml"
\ No newline at end of file
"""
Module docstring: a description of what the .py file (module) does. Here comes the module docstring:
This script analyses a data set about astronauts and creates plots as a result.
"""
#1. import standard imports
from datetime import date
from pathlib import Path
from os import makedirs
#2. import additional third party imports (not included in python)
import matplotlib.pyplot as plt
import pandas as pd
#3. import local files/libraries
# Within each of these groups, sort the imports alphabetically!
# Select matplotlib's "ggplot" style sheet for the plots.
plt.style.use("ggplot")
# Relative path of the JSON data set that perform_analysis() reads.
_ASTRONAUT_DATA = "data/astronauts.json"
# Relative path of the directory the plots are saved to.
_OUTPUT_PATH = "results"
def calculate_age(born: pd.Timestamp) -> int:
    """
    Calculates the current age in full years from a date of birth.

    :param born: pandas.Timestamp (any object offering ``year``, ``month``
        and ``day`` attributes works as well)
    :return: years as integer
    :raises TypeError: if ``born`` is not date-like, e.g. a plain string
    """
    try:
        birth_year = born.year
        birthday = (born.month, born.day)
    except AttributeError as error:
        # A wrong argument type is reported as TypeError (this is what the
        # unit tests expect) instead of leaking an AttributeError.
        raise TypeError(
            "born must be a date-like object such as pandas.Timestamp, "
            "got {}".format(type(born).__name__)
        ) from error

    today = date.today()
    # The tuple comparison is True (counts as 1) while this year's birthday
    # still lies ahead, so one year is subtracted in that case.
    return today.year - birth_year - ((today.month, today.day) < birthday)
def is_alive(date_of_death) -> bool:  # parameter not annotated: it is a pd.Timestamp or a null value
    """
    Tells whether a person is alive, i.e. whether 'date_of_death' is missing.
    :param date_of_death: pandas.Timestamp or NaTType
    :return: True if 'date_of_death' is null, False otherwise
    """
    return bool(pd.isnull(date_of_death))
def died_with_age(row: pd.Series):  # no return annotation: the result is an int or None
    """
    Determines the age a person reached, in full years, at the date of death.
    :param row: pandas.Series with the entries 'birthdate' and 'date_of_death'
    :return: int if a date of death is given, None otherwise
    """
    died = row["date_of_death"]
    if pd.isnull(died):
        return None
    born = row["birthdate"]
    # True (counts as 1) if the person died before the birthday of that year.
    before_birthday = (died.month, died.day) < (born.month, born.day)
    return died.year - born.year - before_birthday
# Data preparation functions
##
def prepare_data_set(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the raw data by:
    - renaming the columns and using 'astronaut_id' as index
    - dropping rows without a 'time_in_space' value
    - setting data types for the date and time columns
    - sorting by 'birthdate'
    - calculating the extra columns 'time_in_space_D', 'alive', 'age'
      and 'died_with_age'

    Args:
        data_frame: A pandas DataFrame.
    Returns:
        A pandas DataFrame with preprocessed data.
    """
    data_frame = rename_columns(data_frame)
    data_frame = data_frame.set_index("astronaut_id")

    # Rows without a time in space cannot be converted and are dropped.
    data_frame = data_frame.dropna(subset=["time_in_space"])

    # Set pandas dtypes for columns with date or time
    # (unit="m": the integer values are interpreted as minutes).
    data_frame["time_in_space"] = pd.to_timedelta(
        data_frame["time_in_space"].astype(int), unit="m"
    )
    data_frame["birthdate"] = pd.to_datetime(data_frame["birthdate"])
    data_frame["date_of_death"] = pd.to_datetime(data_frame["date_of_death"])
    data_frame = data_frame.sort_values("birthdate")

    # Calculate extra columns from the original data.
    # Whole days in space as float. The former astype("timedelta64[D]") is
    # rejected by pandas >= 2.0 (only s/ms/us/ns resolutions are supported),
    # so the day component is read via the .dt accessor instead.
    data_frame["time_in_space_D"] = data_frame["time_in_space"].dt.days.astype(float)
    data_frame["alive"] = data_frame["date_of_death"].apply(is_alive)
    data_frame["age"] = data_frame["birthdate"].apply(calculate_age)
    data_frame["died_with_age"] = data_frame.apply(died_with_age, axis=1)
    return data_frame
def rename_columns(data_frame):
    """
    Returns a copy of the data set with column names that are handier
    for programming with pandas than the original ones. The index
    labels are converted to strings along the way.
    """
    return data_frame.rename(
        index=str,
        columns={
            "astronaut": "astronaut_id",
            "astronautLabel": "name",
            "birthplaceLabel": "birthplace",
            "sex_or_genderLabel": "sex_or_gender",
        },
    )
##
# Plot functions
##
def create_time_of_x_in_space(data_frame, filename, title):
    """
    Generates a plot with the summed up time of 'living beings'
    in space over their birthdates and saves it to a file.

    Args:
        data_frame: pandas DataFrame with the columns 'birthdate' and
            'time_in_space_D'; the values are accumulated in row order.
        filename: name of the image file, handed over to save().
        title: title shown above the plot.
    """
    # Only the columns needed for the plot are copied; the previously
    # computed accumulated minutes were never used and are left out.
    reduced_data_frame = data_frame[["birthdate", "time_in_space_D"]].copy()
    reduced_data_frame["accumulated_time_in_days"] = reduced_data_frame["time_in_space_D"].cumsum()

    axs = reduced_data_frame.plot(x="birthdate", y="accumulated_time_in_days")
    axs.set_title(title)
    axs.set_xlabel("Years ")
    axs.set_ylabel("t in days")
    save(axs.get_figure(), filename)
def create_age_histogram(age_data_frame, died_data_frame):
    """
    Draws a stacked histogram of the astronauts' ages, split into
    'age at death' and 'age of the living', and saves it as
    'combined_histogram.png'.
    """
    fig, axs = plt.subplots()
    # Order of the two series: deceased first, living second.
    age_samples = [died_data_frame["died_with_age"], age_data_frame["age"]]
    axs.hist(age_samples, bins=70, range=(31, 100), stacked=True)
    axs.set_title("Dead vs. Alive astronauts")
    axs.set_xlabel("Age")
    axs.set_ylabel("Number of astronauts")
    save(fig, "combined_histogram.png")
def create_age_boxplot(age_data_frame, died_data_frame):
    """
    Draws two box plots of the astronauts' age distribution, one for the
    dead and one for the living, and saves them as 'boxplot.png'.
    """
    fig, axs = plt.subplots()
    age_samples = [died_data_frame["died_with_age"], age_data_frame["age"]]
    axs.boxplot(age_samples)
    # Label the two boxes (drawn at positions 1 and 2) by category.
    axs.set_xticks([1, 2])
    axs.set_xticklabels(["Dead", "Alive"])
    axs.set_title("Age distribution; Dead vs. Alive astronauts")
    axs.set_xlabel("Category")
    axs.set_ylabel("Age")
    save(fig, "boxplot.png")
def save(fig: plt.Figure, filename: str):
    """
    Saves a matplotlib Figure to a file inside the output directory.
    It overwrites existing files with the same filename. The output
    directory is created if it does not exist yet.

    Args:
        fig: matplotlib.pyplot.Figure
        filename: str
    """
    output_dir = Path(_OUTPUT_PATH).resolve()
    # Make save() usable on its own, not only after perform_analysis()
    # has set up the directory.
    output_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_dir / filename)
def perform_analysis():
    """ Runs the whole pipeline: data preparation first, then all plots. """
    # Set up directory structure and preprocess data
    makedirs(_OUTPUT_PATH, exist_ok=True)
    data_frame = prepare_data_set(pd.read_json(Path(_ASTRONAUT_DATA).resolve()))

    # Time in space, separately for male and female humans.
    # NOTE(review): "spend" in the titles probably should read "spent";
    # the strings are kept unchanged here.
    time_columns = ["birthdate", "time_in_space", "time_in_space_D"]
    plots_by_sex = (
        ("male", "male_humans_in_space.png", "Total time male humans have spend in space"),
        ("female", "female_humans_in_space.png", "Total time female humans have spend in space"),
    )
    for sex, filename, title in plots_by_sex:
        selection = data_frame.loc[data_frame["sex_or_gender"] == sex, time_columns].copy()
        create_time_of_x_in_space(selection, filename, title)

    # Time in space of all humans
    create_time_of_x_in_space(
        data_frame, "humans_in_space.png", "Total time humans have spend in space"
    )

    # Split the data set into deceased and living astronauts
    alive = data_frame["alive"]
    died_data_frame = data_frame.loc[alive == 0, ["died_with_age"]].copy()
    age_data_frame = data_frame.loc[alive == 1, ["age"]].copy()

    # Histogram and box plots comparing both groups
    create_age_histogram(age_data_frame, died_data_frame)
    create_age_boxplot(age_data_frame, died_data_frame)
# Main entry point: run the analysis only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    perform_analysis()
#def save(fig: plt.Figure, filename: str) -> None:
#"""
#Saves a matplotlib Figure to a file. It overwrites existing files.
#:param fig: matplotlib.pyplot.Figure
#:param filename: str
#"""
#fig.savefig(Path(_OUTPUT_PATH).resolve() / Path(filename)) # similar to os.path.join()
#df = pd.read_json(_ASTRONAUT_DATA)
#df = df.rename(index=str, columns={"astronaut": "astronaut_id", "astronautLabel": "name","birthplaceLabel": "birthplace","sex_or_genderLabel": "sex_or_gender"})
#df = df.set_index("astronaut_id")
#df = df.dropna(subset=["time_in_space"])
#df["time_in_space"] = df["time_in_space"].astype(int)
#df["time_in_space"] = pd.to_timedelta(df["time_in_space"], unit="m")
#df["time_in_space_D"] = df["time_in_space"].astype("timedelta64[D]")
#df["birthdate"] = pd.to_datetime(df["birthdate"])
#df["date_of_death"] = pd.to_datetime(df["date_of_death"])
#df.sort_values("birthdate", inplace=True)
#df["alive"] = df["date_of_death"].apply(is_alive)
#df["age"] = df["birthdate"].apply(calculate_age)
#df["died_with_age"] = df.apply(died_with_age, axis=1)
## Male humans in space
#df_male = df.loc[df["sex_or_gender"] == "male", ["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df = df_male[["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df["accumulated_time_in_minutes"] = reduced_df["time_in_space"].cumsum()
#reduced_df["accumulated_time_in_days"] = reduced_df["time_in_space_D"].cumsum()
#reduced_df.plot(x="birthdate", y="accumulated_time_in_days")
#plt.title("Total time male humans have spend in space")
#plt.xlabel("Years")
#plt.ylabel("t in days")
#fig = plt.gcf()
#save(fig, "male_humans_in_space.png")
#fig.savefig()
## Female humans in space
#df_female = df.loc[df["sex_or_gender"] == "female", ["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df = df_female[["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df["accumulated_time_in_minutes"] = reduced_df["time_in_space"].cumsum()
#reduced_df["accumulated_time_in_days"] = reduced_df["time_in_space_D"].cumsum()
#reduced_df.plot(x="birthdate", y="accumulated_time_in_days")
#plt.title("Total time female humans have spend in space")
#plt.xlabel("Years")
#plt.ylabel("t in days")
#fig = plt.gcf()
#fig.savefig("female_humans_in_space.png")
## Humans in space
#reduced_df = df[["birthdate", "time_in_space", "time_in_space_D"]].copy()
#reduced_df["accumulated_time_in_minutes"] = reduced_df["time_in_space"].cumsum()
#reduced_df["accumulated_time_in_days"] = reduced_df["time_in_space_D"].cumsum()
#reduced_df.plot(x="birthdate", y="accumulated_time_in_days")
#plt.title("Total time humans have spend in space")
#plt.xlabel("Years")
#plt.ylabel("t in days")
#fig = plt.gcf()
#fig.savefig("humans_in_space.png")
#died_df = df.loc[df["alive"] == 0, ["died_with_age"]].copy()
#age_df = df.loc[df["alive"] == 1, ["age"]].copy()
## Combined Histogram of dead and alive astronauts
#fig, axs = plt.subplots(1, 1)
#axs.hist([died_df["died_with_age"], age_df["age"]], bins=70, range=(31, 100), stacked=True)
#axs.set_xlabel("Age")
#axs.set_ylabel("Number of astronauts")
#axs.set_title("Dead vs. Alive astronauts")
#fig.savefig("combined_histogram.png")
## Box plots of dead vs alive astronauts
#fig, axs = plt.subplots(1, 1)
#axs.boxplot([died_df["died_with_age"], age_df["age"]])
#axs.set_title("Age distribution; Dead vs. Alive astronauts")
#axs.set_xlabel("Category")
#plt.setp(axs, xticks=[1, 2], xticklabels=["Dead", "Alive"])
#axs.set_ylabel("Age")
#fig.savefig("boxplot.png")
"""
testing
"""
import unittest # framework for testing
import pandas as pd
from src import astronaut_analysis # only works if calling the test from the root directory (cd) root directory is should be working directory
class TestCalculateAge(unittest.TestCase):
    """Type checks for astronaut_analysis.calculate_age."""

    def test_return_type(self):
        """The age calculated from a pandas.Timestamp is an integer."""
        birth_date = pd.Timestamp("1950-01-01")  # set-up: example data with the correct type
        # assertIsInstance gives a clearer failure message than comparing type() objects.
        self.assertIsInstance(astronaut_analysis.calculate_age(birth_date), int)

    def test_arg_type(self):
        """A plain string as birth date is expected to raise a TypeError."""
        birth_date = "1950-01-01"  # set-up: example data with the wrong type, the call should fail
        with self.assertRaises(TypeError):
            astronaut_analysis.calculate_age(birth_date)

    # Analogous data type tests could be added here for the other functions.
# Main entry point: run the tests of this file when it is executed directly.
if __name__ == "__main__":
    unittest.main()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment