diff --git a/src/T09_GroupBy.py b/src/T09_GroupBy.py
index 51f8972..5702ea1 100644
--- a/src/T09_GroupBy.py
+++ b/src/T09_GroupBy.py
@@ -35,6 +35,11 @@ print(grouped_donations.loc["Student"].loc[["Hamburg", "Köln"]])
 # the students row is again a DataFrame
 students = grouped_donations.loc["Student"]
 print(students.loc["Hamburg"])
+print(grouped_donations)
+print(grouped_donations.idxmin())
+print(grouped_donations.loc[grouped_donations.idxmin()])
+print(grouped_donations.loc["Student"].loc["Hamburg"])
+
 print("-"*100)
 # duplicate indices
 info = grouped_donations.unstack()
diff --git a/src/T10_ex_Energy.py b/src/T10_ex_Energy.py
index b92ab9f..80898fa 100644
--- a/src/T10_ex_Energy.py
+++ b/src/T10_ex_Energy.py
@@ -7,10 +7,60 @@ print(energy_df.columns)
 # Green, Non-Green
 
 # 1) How much renewable vs. non-renewable energy was produced in total
+total_energy_production = energy_df[["Generation_TWh", "Energy_Type"]].groupby("Energy_Type").sum()
+print(total_energy_production)
+
 # 2) Per year (unstack)
+yearly_production = energy_df[["Year", "Generation_TWh", "Energy_Type"]].groupby(["Year", "Energy_Type"]).sum()
+yearly_production = yearly_production.unstack()
+print(yearly_production)
+
 # 3) Per year, broken down by Energy_Source
-# 4) Which energy source had the largest/smallest growth (idxmax)
+yearly_production = energy_df[["Year", "Generation_TWh", "Energy_Source"]].groupby(["Year", "Energy_Source"]).sum()
+yearly_production = yearly_production.unstack()
+print(yearly_production)
+
+# 4) Which energy source had the largest/smallest growth (idxmax)
+yearly_production = yearly_production["Generation_TWh"]
+print(yearly_production)
+# a = yearly_production["Biomass"]
+# a.sort_values(inplace=True)  # not allowed: a is a view on yearly_production!
+print("-"*100)
+diff = yearly_production.loc[2024] - yearly_production.loc[2019]
+sorted_diffs = diff.sort_values()
+print(diff)
+print(sorted_diffs)
+
+# largest / smallest growth
+print(f"Largest growth: {diff.idxmax()}, {diff.max()}")
+print(f"Largest growth: {sorted_diffs.index[-1]}, {sorted_diffs.iloc[-1]}")
+print(f"Smallest growth: {diff.idxmin()}, {diff.min()}")
+print(f"Smallest growth: {sorted_diffs.index[0]}, {sorted_diffs.iloc[0]}")
+
 # 5) As percentages
+percentages = yearly_production.divide(yearly_production.sum(axis=1), axis=0)
+print(percentages)
+print(percentages.mean(axis=0))
+perc_diff = percentages.loc[2024] - percentages.loc[2019]
+perc_diff.sort_values(inplace=True)
+print(perc_diff)
 
 # Monthly trend (Jan 2019 and Jan 2020 are distinct groups)
-# In which months was more green than non-green power produced?
\ No newline at end of file
+verlauf = energy_df[["Year", "Month", "Energy_Type", "Generation_TWh"]].groupby(["Year", "Month", "Energy_Type"]).sum().unstack()
+print(verlauf)
+
+# In which months was more green than non-green power produced?
+print(verlauf[verlauf["Generation_TWh", "Green"] > verlauf["Generation_TWh", "Non-Green"]])
+
+verlauf = verlauf["Generation_TWh"]
+print(verlauf["Green"] > verlauf["Non-Green"])
+
+#          Biomass  CoalHard  ...
+# 2019-1       100      2321  ...
+# 2019-2       100      2321  ...
+
+#          Green  Non-Green
+# 2019-1     100       2321
+# 2019-2     110       2121
+# ....
+# 2024-2    2313        111
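
For reference, a minimal self-contained sketch of the groupby → unstack → idxmax growth pattern used in T10_ex_Energy.py. The energy CSV is not part of this diff, so the source names and numbers below are made up:

    import pandas as pd

    # synthetic stand-in for ../data/germany_energy_mix_2019_2024.csv
    energy = pd.DataFrame({
        "Year":           [2019, 2019, 2024, 2024],
        "Energy_Source":  ["Wind", "Coal", "Wind", "Coal"],
        "Generation_TWh": [126.0, 171.0, 138.0, 97.0],
    })

    # one row per year, one column per source
    yearly = (energy.groupby(["Year", "Energy_Source"])["Generation_TWh"]
                    .sum()
                    .unstack())

    # growth between the first and the last year; largest/smallest via idxmax/idxmin
    growth = yearly.loc[2024] - yearly.loc[2019]
    print(f"Largest growth: {growth.idxmax()}, {growth.max()}")   # Wind, 12.0
    print(f"Smallest growth: {growth.idxmin()}, {growth.min()}")  # Coal, -74.0
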
diff --git a/src/T11_Pivotieren.py b/src/T11_Pivotieren.py
new file mode 100644
index 0000000..2983d6d
--- /dev/null
+++ b/src/T11_Pivotieren.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+data = {
+    'Product': ['Fancy Chair', 'Fancy Chair', 'Luxury Sofa', 'Designer Table', 'Luxury Sofa'],
+    'Color': ['Blue', 'Green', 'Blue', 'Green', 'Red'],
+    'Customer Price': [2345.89, 2390.50, 1820.00, 3100.00, 2750.00],
+    'Non-Customer Price': [2445.89, 2495.50, 1980.00, 3400.00, 2850.00]
+}
+
+df = pd.DataFrame(data)
+print(df)
+
+# pivot does not allow duplicate index/column combinations!
+pivoted_df = df.pivot(index="Product",
+                      columns="Color",
+                      values="Non-Customer Price")
+
+print(pivoted_df)
+
+pivoted_df = df.pivot(index="Product",
+                      columns="Color",
+                      values=["Non-Customer Price", "Customer Price"])
+
+print(pivoted_df)
+
+beverages = pd.read_csv("../data/beverages.csv")
+beverages["Day"] = (["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"] * 35)[:103]
+print(beverages)
+
+# pass an aggfunc for the duplicates
+# pivot_table ~= groupby().agg().unstack()
+coffee_pivot = beverages.pivot_table(
+    index="Name",
+    columns="Day",
+    values="Coffee",
+    aggfunc="mean",
+    fill_value=0
+).round(1)
+print(coffee_pivot)
+# the same as a groupby
+coffees = beverages[["Name", "Day", "Coffee"]].groupby(["Name", "Day"]).mean().unstack().round(1)
+print(coffees)
+coffees[coffees.isna()] = 0.0
+print(coffees)
+
+# 1. Energy data
+#    ["Year_Quarter"] = ["Year"].astype(str) + ["Quarter"]  # 2019 + "Q1"
+#    group the production per type for each quarter
+#    pivot -> rows: (year, Q1)  columns: Biomass, Wind, ...
+
+# df.index <- name of the index column
print("\n"*3)
+energy_df = pd.read_csv("../data/germany_energy_mix_2019_2024.csv")
+# 1) New column
+energy_df["Year Quarter"] = energy_df["Year"].astype(str) + " " + energy_df['Quarter']
+# drop: axis=0 deletes the rows named [Year, Quarter],
+# axis=1 deletes the columns named [Year, Quarter]
+energy_df.drop(["Year", "Quarter"], axis=1, inplace=True)
+print(energy_df)
+
+# 2) Group by 'Year Quarter' and Energy_Source
+quarterly_data = energy_df[["Year Quarter", "Energy_Source", "Generation_TWh"]].groupby(
+    ["Year Quarter", "Energy_Source"]).sum().unstack()
+print(quarterly_data)
+
+# 3) The same via pivot_table
+quarterly_data = energy_df.pivot_table(
+    index="Year Quarter",
+    columns="Energy_Source",
+    values="Generation_TWh",
+    aggfunc="sum",
+    fill_value=0,  # so that a value exists for every energy source in every quarter
+)
+print(quarterly_data)
+
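
A minimal sketch of why pivot refuses duplicates while pivot_table aggregates them, on synthetic data (made-up products and prices, not the furniture example above):

    import pandas as pd

    # duplicate (Product, Color) pair: "Chair"/"Blue" appears twice
    df = pd.DataFrame({
        "Product": ["Chair", "Chair", "Chair", "Sofa"],
        "Color":   ["Blue",  "Blue",  "Green", "Red"],
        "Price":   [100.0,   120.0,   90.0,    300.0],
    })

    try:
        df.pivot(index="Product", columns="Color", values="Price")
    except ValueError as err:
        print("pivot refuses duplicates:", err)

    # pivot_table aggregates the duplicates instead (mean of 100 and 120 -> 110)
    print(df.pivot_table(index="Product", columns="Color",
                         values="Price", aggfunc="mean", fill_value=0))

    # roughly the same thing spelled out as groupby + unstack
    print(df.groupby(["Product", "Color"])["Price"].mean().unstack(fill_value=0))

The last two prints yield essentially the same table, which is the pivot_table ~= groupby().agg().unstack() equivalence noted in the comments above.
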
pd.read_csv("../data/Balkonkraftwerk.csv", index_col=0) +solar_df.index = pd.to_datetime(solar_df.index) +print(solar_df) +print(solar_df.columns) + +# 1) Wie sieht es im durchschnitt jeden Tag aus (D) +# 2) An welchen Tagen war die effizientz > 35% +# 3) Stündliche Werte interpolieren (h) (1h), (3h) +# - Komisch