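In this tutorial, you’ll learn how to download and load a dataset with pandas, explore and query it, aggregate and group values, clean up columns and missing data, combine DataFrames, and plot the results.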
You’ll also learn about the differences between the main data structures that Pandas and Python use. To follow along, you can get all of the example code in this tutorial at the link below:
# Download the NBA Elo CSV and store it in the working directory.
import requests

download_url = "https://raw.githubusercontent.com/fivethirtyeight/data/master/nba-elo/nbaallelo.csv"
target_csv_path = "nba_all_elo.csv"

# A timeout keeps the script from hanging forever if the server never answers.
response = requests.get(download_url, timeout=30)
response.raise_for_status()  # Check that the request was successful

# Binary mode: the payload is written exactly as received.
with open(target_csv_path, "wb") as f:
    f.write(response.content)
print("Download ready.")
import pandas as pd

# Load the downloaded CSV into a DataFrame.
nba = pd.read_csv("nba_all_elo.csv")

# The original inspected `ds`, a name that is never defined (NameError);
# the object just created is `nba`.
type(nba)

len(nba)  # number of rows
nba.shape  # (rows, columns)
nba.head()  # first rows

# Limit displayed floats to two decimal places.
pd.set_option("display.precision", 2)
nba.tail()  # last rows

nba.info()  # column names, non-null counts and dtypes
nba.describe()  # summary statistics for the numeric columns
import numpy as np

# Summarise the object-dtype columns. `np.object` was removed in
# NumPy 1.24 (AttributeError); the builtin `object` selects the same columns.
nba.describe(include=object)

# How often each team ID / franchise name occurs.
nba["team_id"].value_counts()
nba["fran_id"].value_counts()

# Team IDs that appear under the "Lakers" franchise.
nba.loc[nba["fran_id"] == "Lakers", "team_id"].value_counts()

# Smallest and largest date_game value for team "MNL", first together and
# then one at a time.
# NOTE(review): date_game is only converted with pd.to_datetime later (on a
# copy), so if it is still a string column here min/max compare
# lexicographically rather than chronologically -- confirm.
nba.loc[nba["team_id"] == "MNL", "date_game"].agg(("min", "max"))
nba.loc[nba["team_id"] == "MNL", "date_game"].max()
nba.loc[nba["team_id"] == "MNL", "date_game"].min()
# --- Series: a sequence of values plus an index ---------------------------

# Without an explicit index the entries are labelled 0, 1, 2, ...
revenues = pd.Series(data=[5555, 7000, 1980])
revenues
revenues.values
revenues.index
type(revenues.values)

# Labels can be passed explicitly alongside the values ...
city_revenues = pd.Series(
    data=[4200, 8000, 6500],
    index=["Amsterdam", "London", "New York"],
)
city_revenues

# ... or taken from the keys of a dict.
city_employee_count = pd.Series(data={"Amsterdam": 5, "London": 8})
city_employee_count

# Dict-style helpers operate on the index labels.
city_employee_count.keys()
"Tokyo" in city_employee_count
"London" in city_employee_count
# Combine the two Series into a DataFrame: the dict keys become the column
# names and the rows are aligned on the index labels. "New York" has no
# entry in city_employee_count, so that cell is filled with NaN.
city_data = pd.DataFrame({"revenue": city_revenues, "employee_count": city_employee_count})
city_data
# Row labels and the underlying 2-D array of values.
city_data.index
city_data.values
# `axes` lists both axes: [0] is the row index, [1] the column index.
city_data.axes
city_data.axes[0]
city_data.axes[1]
# Dict-style helpers on a DataFrame refer to the columns, not the rows.
city_data.keys()
"revenue" in city_data
"Tokyo" in city_data
# The same attributes on the NBA dataset.
nba.index
nba.axes
"pts" in nba.keys()
# Label-based lookup with [] works like a dict.
city_revenues["London"]

# Positional lookups go through .iloc. Passing an integer to [] on a
# label-indexed Series relies on a positional fallback that pandas has
# deprecated, so the intent is spelled out explicitly here.
city_revenues.iloc[1]
city_revenues.iloc[-1]

# Integer slices with [] are positional: everything from position 1 on.
city_revenues[1:]
city_revenues["New York"]
# An integer index that does not start at 0 makes the difference between
# label-based (.loc) and position-based (.iloc) access visible.
colors = pd.Series(
    data=["red", "purple", "blue", "green", "yellow"],
    index=[1, 2, 3, 5, 8],
)
colors

colors.loc[1]  # label 1
colors.iloc[1]  # position 1
colors.iloc[1:3]  # positions 1 and 2; the stop position is excluded
colors.loc[3:8]  # labels 3 through 8; the stop label is included
colors.iloc[-2]  # second-to-last element
# Select a single column by name with []; the type() call shows what kind
# of object comes back.
city_data["revenue"]
type(city_data["revenue"])
# Attribute-style access to the same column.
city_data.revenue
city_data
# A column called "shape" collides with the DataFrame.shape attribute, so
# it has to be read with [] rather than with dot notation.
toys = pd.DataFrame(
    data=[
        {"name": "ball", "shape": "sphere"},
        {"name": "rubik's cube", "shape": "cube"},
    ]
)
toys["shape"]
# Row selected by its index label.
city_data.loc["Amsterdam"]
# Row selected by position (the second row).
city_data.iloc[1]
# Second-to-last row of the NBA dataset.
nba.iloc[-2]
# Combine several conditions with `&`; each comparison needs its own
# parentheses because `&` binds tighter than the comparison operators.

# Rows with _iscopy == 0 for team "BLB" where both sides scored over 100.
nba[
    (nba["_iscopy"] == 0)
    & (nba["pts"] > 100)
    & (nba["opp_pts"] > 100)
    & (nba["team_id"] == "BLB")
]

# Rows with _iscopy == 0 from year_id 1992 for team IDs starting with "LA"
# that have a non-missing notes entry.
nba[
    (nba["_iscopy"] == 0)
    & (nba["team_id"].str.startswith("LA"))
    & (nba["year_id"] == 1992)
    & (nba["notes"].notnull())
]
# Aggregations on a small Series ...
city_revenues.sum()
city_revenues
city_revenues.max()

# ... and on a single column of the NBA dataset.
points = nba["pts"]
type(points)
points.sum()

# Per-franchise totals and maxima; sort=False keeps the groups in order of
# first appearance instead of sorting them by key.
nba.groupby("fran_id", sort=False)["pts"].sum()
maxpoints = nba.groupby("fran_id", sort=False)["pts"].max()
maxpoints.max()

# Spurs rows after year_id 2010, counted per year and game result.
(
    nba[(nba["fran_id"] == "Spurs") & (nba["year_id"] > 2010)]
    .groupby(["year_id", "game_result"])["game_id"]
    .count()
)

# Warriors rows in year_id 2015, counted per playoff flag and game result.
(
    nba[(nba["fran_id"] == "Warriors") & (nba["year_id"] == 2015)]
    .groupby(["is_playoffs", "game_result"])["game_id"]
    .count()
)
# Work on a copy so the original `nba` frame stays untouched.
df = nba.copy()
df.shape

# Add a derived column: own points minus the opponent's points.
df["difference"] = df["pts"] - df["opp_pts"]
df.shape
df["difference"].max()

# rename() returns a new frame; `df` itself keeps the old column names.
renamed_df = df.rename(
    columns={
        "game_result": "result",
        "game_location": "location",
    },
)
renamed_df.info()
df.shape

# Remove the four Elo columns from `df` in place.
elo_columns = ["elo_i", "elo_n", "opp_elo_i", "opp_elo_n"]
df.drop(columns=elo_columns, inplace=True)
df.shape
df.info()
# Parse the date_game column into datetime values.
df["date_game"] = pd.to_datetime(df["date_game"])
# Check how many distinct values game_location holds and how often each
# occurs before converting it.
df["game_location"].nunique()
df["game_location"].value_counts()
# Store the column with a categorical dtype and confirm the new dtype.
df["game_location"] = pd.Categorical(df["game_location"])
df["game_location"].dtype
df.info()
# Same inspection and conversion for game_result.
df["game_result"].nunique()
df["game_result"].value_counts()
df["game_result"] = pd.Categorical(df["game_result"])
df.info()
# Drop every row that contains at least one missing value.
rows_without_missing_data = nba.dropna()
rows_without_missing_data.shape

# Alternatively drop every column that contains a missing value.
data_without_missing_columns = nba.dropna(axis=1)
data_without_missing_columns.shape

# Replace missing notes with a default text. The result is assigned back to
# the column instead of calling fillna(inplace=True) on the object returned
# by ["notes"]: that chained in-place call is deprecated and leaves the
# frame unchanged under Copy-on-Write.
data_with_default_notes = nba.copy()
data_with_default_notes["notes"] = data_with_default_notes["notes"].fillna(
    value="no notes at all"
)
data_with_default_notes["notes"].describe()
# Rows where the recorded points are zero.
nba[nba["pts"] == 0]
# Consistency checks: `.empty` is True when no row has a score that
# contradicts its recorded game_result (more points without 'W', fewer
# points without 'L').
nba[(nba["pts"] > nba["opp_pts"]) & (nba["game_result"] != 'W')].empty
nba[(nba["pts"] < nba["opp_pts"]) & (nba["game_result"] != 'L')].empty
# A second city table with the same two columns, built from plain lists
# plus an explicit row index.
further_city_data = pd.DataFrame(
    data={
        "revenue": [7000, 3400],
        "employee_count": [2, 2],
    },
    index=["New York", "Barcelona"],
)
# Stack the rows of both frames. concat keeps the labels of each input, so
# "New York" (present in both) appears twice in the result.
all_city_data = pd.concat([city_data, further_city_data], sort=False)
all_city_data
# Rows with a year_id greater than 2010.
current_decade = nba[nba["year_id"] > 2010]
current_decade.shape
# Rows whose notes field is not missing.
games_with_notes = nba[nba["notes"].notnull()]
games_with_notes.shape
# Rows whose franchise name ends in "ers".
ers = nba[nba["fran_id"].str.endswith("ers")]
ers.shape
# `%matplotlib inline` is an IPython/Jupyter magic, not Python syntax, and
# makes a plain .py file fail to parse. It is kept as a comment; run it in a
# notebook cell to render the plots inline.
# %matplotlib inline

# Sum of points per year_id for the Knicks, as a line plot.
nba[nba["fran_id"] == "Knicks"].groupby("year_id")["pts"].sum().plot()

# The ten most frequent franchise names, as a bar chart.
nba["fran_id"].value_counts().head(10).plot(kind="bar")

# Distribution of game results for the Heat in year_id 2013, as a pie chart.
(
    nba[(nba["fran_id"] == "Heat") & (nba["year_id"] == 2013)]["game_result"]
    .value_counts()
    .plot(kind="pie")
)