# Listing metadata from the code viewer: 114 lines, 3.5 KiB, Python
import pandas as pd

# pandas is the standard library used throughout this script.
# It provides two data structures:
#   * Series    - a single column
#   * DataFrame - a table built from several columns

# Raw data as a dict of equally long lists; every key becomes one column.
cities = {
    "Stadt": [
        "London", "Berlin", "Madrid", "Rom",
        "Paris", "Wien", "Bukarest", "Hamburg",
        "Budapest", "Warsaw", "Barcelona",
        "München", "Mailand",
    ],
    "Population": [
        8615246, 3562166, 3165235, 2874038,
        2273305, 1805681, 1803425, 1760433,
        1754000, 1740119, 1602386, 1493900,
        1350680,
    ],
    "Land": [
        "England", "Deutschland", "Spanien", "Italien",
        "Frankreich", "Österreich", "Romanien",
        "Deutschland", "Ungarn", "Polen", "Spanien",
        "Deutschland", "Italien",
    ],
}

# Create the DataFrame (rows get the default 0..n-1 integer index).
cities_df = pd.DataFrame(cities)

# Adding new columns
areas = [1572, 892, 604, 1285, 105, 415, 228, 755, 525, 517, 101, 310, 182]

# Alternative: cities_df["Flaeche"] = areas would append the column at the end.
print(cities_df)

# insert(position, name, values)
#   position - column position at which the new column is placed
#   name     - label of the new column
#   values   - a scalar or one value per row
# insert() only succeeds if no column named "Flaeche" exists yet.
cities_df.insert(2, "Flaeche", areas)

# In contrast, a plain assignment such as cities_df["Flaeche"] = -1 does not
# check whether the column already exists.
print(cities_df)
print("-"*100)

# Setting our own index

# inplace=False (the default) leaves cities_df untouched and returns a new
# DataFrame that is indexed by "Stadt".
df2 = cities_df.set_index("Stadt", inplace=False)
print(df2)
print(cities_df)

# inplace=True modifies cities_df itself.
cities_df.set_index("Stadt", inplace=True)
print(cities_df)
print("\n", "*"*100)

# Selection in pandas

# Selecting a column
print(cities_df["Population"])
print("-" * 100)

# Selecting rows
# ... by index label
print(cities_df.loc["Paris"])

# ... by numeric position
print()
print(cities_df.iloc[7])
print()

# Slicing
# Label slicing with .loc needs one of two conditions to hold:
#   * the index is unique, or
#   * the index is sorted
print(cities_df.loc["Rom":"Warsaw"])  # both ends are inclusive

# Switch the index to the country column ("Land")

# 1) Reset the index so the current index becomes a regular column again
cities_df.reset_index(inplace=True)

# 2) Use "Land" as the new index; it contains duplicates and is not sorted
cities_df.set_index("Land", inplace=True)
print(cities_df)

# NOTE(review): this slice appears to work on the unsorted, non-unique index
# only because both boundary labels ("England", "Polen") occur exactly once.
print(cities_df.loc["England":"Polen"])

# Sorting the index makes label slicing well defined even with duplicates.
cities_df.sort_index(inplace=True)
print("-"*100)
print(cities_df)
print(cities_df.loc["Deutschland":"Italien"])
print("-"*100)

# A duplicated label selects every row carrying that label.
print(cities_df.loc["Italien"])
print("-"*100)

# Positional slicing: the first five rows (end is exclusive).
print(cities_df.iloc[0:5])

# Start over with a fresh DataFrame that uses the default integer index.
cities_df = pd.DataFrame(cities)
cities_df["Flaeche"] = areas  # plain assignment appends the column at the end
print(cities_df)

# Sorting by a column reorders the rows; each row keeps its index label.
cities_df.sort_values(by="Stadt", inplace=True)
print(cities_df)

# loc[4] looks up the row with index label 4, whereas iloc[4] returns the
# row at position 4 of the sorted frame.
print(cities_df.loc[4])
print(cities_df.iloc[4])
print("-"* 100)
print(cities_df)

# Selection based on the values in a column

# Boolean selection mask: only rows where the mask is True are kept.
print(cities_df[cities_df["Population"] > 2e6])
print(cities_df[cities_df["Population"] > 2_000_000])

# The mask itself: a boolean Series with one entry per row.
print(cities_df["Population"] > 2_000_000)
print()
print("-" * 100)

# Two masks combined element-wise with &; each comparison needs its own
# parentheses. A row is kept only if both masks are True for it.
print(cities_df[
    (cities_df["Population"] > 2_000_000) &
    (cities_df["Flaeche"] > 1_000)
])

# String columns (str.contains supports regular expressions)
# [Ee] matches an upper- or lower-case "e". Inside a character class "|" is
# a literal pipe, not an alternation, so it must not be written there.
print(cities_df[cities_df["Land"].str.contains(r"[Ee]n")])
print(cities_df[cities_df["Land"] == "Spanien"])
print()
print("-" * 100)

# Keep only the Spanish cities, then only the numeric columns, and sum them.
spanische_staedte = cities_df[cities_df["Land"] == "Spanien"]
spanien_info = spanische_staedte[["Population", "Flaeche"]]
print(spanien_info.sum())

# Element-wise operators for combining boolean masks:
#   &  and
#   |  or
#   ^  exclusive or (True ^ True is False)

# Cities located in Spain or in Germany
stadt_de_esp = cities_df[
    (cities_df["Land"] == "Spanien") | (cities_df["Land"] == "Deutschland")
]
info = stadt_de_esp[["Population", "Flaeche"]]
print(info)
print(info.sum())