Ch05 Data Analysis with pandas¶
DataFrame and Series¶
import pandas as pd
pd.read_excel("xl/course_participants.xlsx")
data=[["Mark", 55, "Italy", 4.5, "Europe"],
["John", 33, "USA", 6.7, "America"],
["Tim", 41, "USA", 3.9, "America"],
["Jenny", 12, "Germany", 9.0, "Europe"]]
df = pd.DataFrame(data=data,
columns=["name", "age", "country",
"score", "continent"],
index=[1001, 1000, 1002, 1003])
df
df.info()
Index¶
df.index
df.index.name = "user_id"
df
# "reset_index" turns the index into a column, replacing the
# index with the default index. This corresponds to the DataFrame
# from the beginning that we loaded from Excel.
df.reset_index()
# "reset_index" turns "user_id" into a regular column and
# "set_index" turns the column "name" into the index
df.reset_index().set_index("name")
df.reindex([999, 1000, 1001, 1004])
df.sort_index()
df.sort_values(["continent", "age"])
Columns¶
df.columns
df.columns.name = "properties"
df
df.rename(columns={"name": "First Name", "age": "Age"})
df.drop(columns=["name", "country"],
index=[1000, 1003])
df.T # Shortcut for df.transpose()
df.loc[:, ["continent", "country", "name", "age", "score"]]
Data Manipulation¶
Selecting Data¶
# Using scalars for both row and column selection returns a scalar
df.loc[1001, "name"]
# Using a scalar on either the row or column selection returns a Series
df.loc[[1001, 1002], "age"]
# Selecting multiple rows and columns returns a DataFrame
df.loc[:1002, ["name", "country"]]
df.iloc[0, 0] # Returns a Scalar
df.iloc[[0, 2], 1] # Returns a Series
df.iloc[:3, [0, 2]] # Returns a DataFrame
tf = (df["age"] > 40) & (df["country"] == "USA")
tf # This is a Series with only True/False
df.loc[tf, :]
df.loc[df.index > 1001, :]
df.loc[df["country"].isin(["Italy", "Germany"]), :]
# This could be the yearly rainfall in millimeters
rainfall = pd.DataFrame(data={"City 1": [300.1, 100.2],
"City 2": [400.3, 300.4],
"City 3": [1000.5, 1100.6]})
rainfall
rainfall < 400
rainfall[rainfall < 400]
# A MultiIndex needs to be sorted
df_multi = df.reset_index().set_index(["continent", "country"])
df_multi = df_multi.sort_index()
df_multi
df_multi.loc["Europe", :]
df_multi.loc[("Europe", "Italy"), :]
df_multi.reset_index(level=0)
Setting Data¶
# Copy the DataFrame first to leave the original untouched
df2 = df.copy()
df2.loc[1000, "name"] = "JOHN"
df2
df2.loc[[1000, 1001], "score"] = [3, 4]
df2
tf = (df2["age"] < 20) | (df2["country"] == "USA")
df2.loc[tf, "name"] = "xxx"
df2
# Copy the DataFrame first to leave the original untouched
rainfall2 = rainfall.copy()
rainfall2
# Set the values to 0 wherever they are below 400
rainfall2[rainfall2 < 400] = 0
rainfall2
df2.replace("USA", "U.S.")
df2.loc[:, "discount"] = 0
df2.loc[:, "price"] = [49.9, 49.9, 99.9, 99.9]
df2
df2 = df.copy() # let's start with a fresh copy
df2.loc[:, "birth year"] = 2021 - df2["age"]
df2
Missing Data¶
df2 = df.copy() # let's start with a fresh copy
df2.loc[1000, "score"] = None
df2.loc[1003, :] = None
df2
df2.dropna()
df2.dropna(how="all")
df2.isna()
df2.fillna({"score": df2["score"].mean()})
Duplicate Data¶
df.drop_duplicates(["country", "continent"])
df["country"].is_unique
df["country"].unique()
# By default, it marks only duplicates as True, i.e.
# without the first occurrence
df["country"].duplicated()
# To get all rows where "country" is duplicated, use
# keep=False
df.loc[df["country"].duplicated(keep=False), :]
Arithmetic Operations¶
rainfall
rainfall + 100
more_rainfall = pd.DataFrame(data=[[100, 200], [300, 400]],
index=[1, 2],
columns=["City 1", "City 4"])
more_rainfall
rainfall + more_rainfall
rainfall.add(more_rainfall, fill_value=0)
# A Series taken from a row
rainfall.loc[1, :]
rainfall + rainfall.loc[1, :]
# A Series taken from a column
rainfall.loc[:, "City 2"]
rainfall.add(rainfall.loc[:, "City 2"], axis=0)
# Let's create a new DataFrame
users = pd.DataFrame(data=[" mArk ", "JOHN ", "Tim", " jenny"],
columns=["name"])
users
users_cleaned = users.loc[:, "name"].str.strip().str.capitalize()
users_cleaned
users_cleaned.str.startswith("J")
Applying a Function¶
rainfall
def format_string(x):
return f"{x:,.2f}"
# Note that we pass in the function without calling it,
# i.e., format_string and not format_string()!
rainfall.applymap(format_string)
rainfall.applymap(lambda x: f"{x:,.2f}")
Combining DataFrames¶
Concatenating¶
data=[[15, "France", 4.1, "Becky"],
[44, "Canada", 6.1, "Leanne"]]
more_users = pd.DataFrame(data=data,
columns=["age", "country", "score", "name"],
index=[1000, 1011])
more_users
pd.concat([df, more_users], axis=0)
data=[[3, 4],
[5, 6]]
more_categories = pd.DataFrame(data=data,
columns=["quizzes", "logins"],
index=[1000, 2000])
more_categories
pd.concat([df, more_categories], axis=1)
Joining and Merging¶
df1 = pd.DataFrame(data=[[1, 2], [3, 4], [5, 6]],
columns=["A", "B"])
df1
df2 = pd.DataFrame(data=[[10, 20], [30, 40]],
columns=["C", "D"], index=[1, 3])
df2
df1.join(df2, how="inner")
df1.join(df2, how="left")
df1.join(df2, how="right")
df1.join(df2, how="outer")
# Add a column called "category" to both DataFrames
df1["category"] = ["a", "b", "c"]
df2["category"] = ["c", "b"]
df1
df2
df1.merge(df2, how="inner", on=["category"])
df1.merge(df2, how="left", on=["category"])
Data Aggregation and Descriptive Statistics¶
Grouping¶
df.groupby(["continent"]).mean()
df.groupby(["continent", "country"]).mean()
df.groupby(["continent"]).agg(lambda x: x.max() - x.min())
Pivoting and Melting¶
data = [["Oranges", "North", 12.30],
["Apples", "South", 10.55],
["Oranges", "South", 22.00],
["Bananas", "South", 5.90],
["Bananas", "North", 31.30],
["Oranges", "North", 13.10]]
sales = pd.DataFrame(data=data,
columns=["Fruit", "Region", "Revenue"])
sales
pivot = pd.pivot_table(sales,
index="Fruit", columns="Region",
values="Revenue", aggfunc="sum",
margins=True, margins_name="Total")
pivot
pd.melt(pivot.iloc[:-1,:-1].reset_index(),
id_vars="Fruit",
value_vars=["North", "South"], value_name="Revenue")
Plotting¶
Matplotlib¶
import numpy as np
%matplotlib inline
# Or %matplotlib notebook
data = pd.DataFrame(data=np.random.rand(4, 4) * 100000,
index=["Q1", "Q2", "Q3", "Q4"],
columns=["East", "West", "North", "South"])
data.index.name = "Quarters"
data.columns.name = "Region"
data
data.plot() # Shortcut for data.plot.line()
Plotly¶
# Set the plotting backend to Plotly
pd.options.plotting.backend = "plotly"
data.plot()
# Display the same data as bar plot
data.plot.bar(barmode="group")
Data Import and Export¶
Exporting to a CSV file¶
df.to_csv("course_participants.csv")
Importing a CSV file¶
msft = pd.read_csv("csv/MSFT.csv")
msft.info()
# I am selecting a few columns because of space issues
# You can also just run: msft.head()
msft.loc[:, ["Date", "Adj Close", "Volume"]].head()
msft.loc[:, ["Date", "Adj Close", "Volume"]].tail(2)
msft.loc[:, ["Adj Close", "Volume"]].describe()
# The line break in the URL is only to make it fit on the page
url = ("https://raw.githubusercontent.com/fzumstein/"
"python-for-excel/1st-edition/csv/MSFT.csv")
msft = pd.read_csv(url)
msft.loc[:, ["Date", "Adj Close", "Volume"]].head(2)