Ch05 Data Analysis with pandas¶

DataFrame and Series¶

import pandas as pd

pd.read_excel("xl/course_participants.xlsx")

data=[["Mark", 55, "Italy", 4.5, "Europe"],
      ["John", 33, "USA", 6.7, "America"],
      ["Tim", 41, "USA", 3.9, "America"],
      ["Jenny", 12, "Germany", 9.0, "Europe"]]
df = pd.DataFrame(data=data,
                  columns=["name", "age", "country",
                           "score", "continent"],
                  index=[1001, 1000, 1002, 1003])
df

df.info()

Index¶

df.index

df.index.name = "user_id"
df

# "reset_index" turns the index into a column, replacing the
# index with the default index. This corresponds to the DataFrame
# from the beginning that we loaded from Excel.
df.reset_index()

# "reset_index" turns "user_id" into a regular column and
# "set_index" turns the column "name" into the index
df.reset_index().set_index("name")

df.reindex([999, 1000, 1001, 1004])

df.sort_index()

df.sort_values(["continent", "age"])

Columns¶

df.columns

df.columns.name = "properties"
df

df.rename(columns={"name": "First Name", "age": "Age"})

df.drop(columns=["name", "country"],
        index=[1000, 1003])

df.T  # Shortcut for df.transpose()

df.loc[:, ["continent", "country", "name", "age", "score"]]

Data Manipulation¶

Selecting Data¶

# Using scalars for both row and column selection returns a scalar
df.loc[1001, "name"]

# Using a scalar on either the row or column selection returns a Series
df.loc[[1001, 1002], "age"]

# Selecting multiple rows and columns returns a DataFrame
df.loc[:1002, ["name", "country"]]

df.iloc[0, 0]  # Returns a Scalar

df.iloc[[0, 2], 1]  # Returns a Series

df.iloc[:3, [0, 2]]  # Returns a DataFrame

tf = (df["age"] > 40) & (df["country"] == "USA")
tf  # This is a Series with only True/False

df.loc[tf, :]

df.loc[df.index > 1001, :]

df.loc[df["country"].isin(["Italy", "Germany"]), :]

# This could be the yearly rainfall in millimeters
rainfall = pd.DataFrame(data={"City 1": [300.1, 100.2],
                              "City 2": [400.3, 300.4],
                              "City 3": [1000.5, 1100.6]})
rainfall

rainfall < 400

rainfall[rainfall < 400]

# A MultiIndex needs to be sorted
df_multi = df.reset_index().set_index(["continent", "country"])
df_multi = df_multi.sort_index()
df_multi

df_multi.loc["Europe", :]

df_multi.loc[("Europe", "Italy"), :]

df_multi.reset_index(level=0)

Setting Data¶

# Copy the DataFrame first to leave the original untouched
df2 = df.copy()

df2.loc[1000, "name"] = "JOHN"
df2

df2.loc[[1000, 1001], "score"] = [3, 4]
df2

tf = (df2["age"] < 20) | (df2["country"] == "USA")
df2.loc[tf, "name"] = "xxx"
df2

# Copy the DataFrame first to leave the original untouched
rainfall2 = rainfall.copy()
rainfall2

# Set the values to 0 wherever they are below 400
rainfall2[rainfall2 < 400] = 0
rainfall2

df2.replace("USA", "U.S.")

df2.loc[:, "discount"] = 0
df2.loc[:, "price"] = [49.9, 49.9, 99.9, 99.9]
df2

df2 = df.copy()  # let's start with a fresh copy
df2.loc[:, "birth year"] = 2021 - df2["age"]
df2

Missing Data¶

df2 = df.copy() # let's start with a fresh copy
df2.loc[1000, "score"] = None
df2.loc[1003, :] = None
df2

df2.dropna()

df2.dropna(how="all")

df2.isna()

df2.fillna({"score": df2["score"].mean()})

Duplicate Data¶

df.drop_duplicates(["country", "continent"])

df["country"].is_unique

df["country"].unique()

# By default, it marks only duplicates as True, i.e.
# without the first occurrence
df["country"].duplicated()

# To get all rows where "country" is duplicated, use
# keep=False
df.loc[df["country"].duplicated(keep=False), :]

Arithmetic Operations¶

rainfall

rainfall + 100

more_rainfall = pd.DataFrame(data=[[100, 200], [300, 400]],
                             index=[1, 2],
                             columns=["City 1", "City 4"])
more_rainfall

rainfall + more_rainfall

rainfall.add(more_rainfall, fill_value=0)

# A Series taken from a row
rainfall.loc[1, :]

rainfall + rainfall.loc[1, :]

# A Series taken from a column
rainfall.loc[:, "City 2"]

rainfall.add(rainfall.loc[:, "City 2"], axis=0)

# Let's create a new DataFrame
users = pd.DataFrame(data=[" mArk ", "JOHN  ", "Tim", " jenny"],
                     columns=["name"])
users

users_cleaned = users.loc[:, "name"].str.strip().str.capitalize()
users_cleaned

users_cleaned.str.startswith("J")

Applying a Function¶

rainfall

def format_string(x):
    return f"{x:,.2f}"

# Note that we pass in the function without calling it,
# i.e., format_string and not format_string()!
rainfall.applymap(format_string)

rainfall.applymap(lambda x: f"{x:,.2f}")

Combining DataFrames¶

Concatenating¶

data=[[15, "France", 4.1, "Becky"],
      [44, "Canada", 6.1, "Leanne"]]
more_users = pd.DataFrame(data=data,
                          columns=["age", "country", "score", "name"],
                          index=[1000, 1011])
more_users

pd.concat([df, more_users], axis=0)

data=[[3, 4],
      [5, 6]]
more_categories = pd.DataFrame(data=data,
                               columns=["quizzes", "logins"],
                               index=[1000, 2000])
more_categories

pd.concat([df, more_categories], axis=1)

Joining and Merging¶

df1 = pd.DataFrame(data=[[1, 2], [3, 4], [5, 6]],
                   columns=["A", "B"])
df1

df2 = pd.DataFrame(data=[[10, 20], [30, 40]],
                   columns=["C", "D"], index=[1, 3])
df2

df1.join(df2, how="inner")

df1.join(df2, how="left")

df1.join(df2, how="right")

df1.join(df2, how="outer")

# Add a column called "category" to both DataFrames
df1["category"] = ["a", "b", "c"]
df2["category"] = ["c", "b"]

df1

df2

df1.merge(df2, how="inner", on=["category"])

df1.merge(df2, how="left", on=["category"])

Data Aggregation and Descriptive Statistics¶

Descriptive Statistics¶

rainfall

rainfall.mean()

rainfall.mean(axis=1)

Grouping¶

df.groupby(["continent"]).mean()

df.groupby(["continent", "country"]).mean()

df.groupby(["continent"]).agg(lambda x: x.max() - x.min())

Pivoting and Melting¶

data = [["Oranges", "North", 12.30],
        ["Apples", "South", 10.55],
        ["Oranges", "South", 22.00],
        ["Bananas", "South", 5.90],
        ["Bananas", "North", 31.30],
        ["Oranges", "North", 13.10]]

sales = pd.DataFrame(data=data,
                     columns=["Fruit", "Region", "Revenue"])
sales

pivot = pd.pivot_table(sales,
                       index="Fruit", columns="Region",
                       values="Revenue", aggfunc="sum",
                       margins=True, margins_name="Total")
pivot

pd.melt(pivot.iloc[:-1,:-1].reset_index(),
        id_vars="Fruit",
        value_vars=["North", "South"], value_name="Revenue")

Plotting¶

Matplotlib¶

import numpy as np
%matplotlib inline
# Or %matplotlib notebook

data = pd.DataFrame(data=np.random.rand(4, 4) * 100000,
                    index=["Q1", "Q2", "Q3", "Q4"],
                    columns=["East", "West", "North", "South"])
data.index.name = "Quarters"
data.columns.name = "Region"
data

data.plot()  # Shortcut for data.plot.line()

Plotly¶

# Set the plotting backend to Plotly
pd.options.plotting.backend = "plotly"

data.plot()

# Display the same data as bar plot
data.plot.bar(barmode="group")

Data Import and Export¶

Exporting to a CSV file¶

df.to_csv("course_participants.csv")

Importing a CSV file¶

msft = pd.read_csv("csv/MSFT.csv")

msft.info()

# I am selecting a few columns because of space issues
# You can also just run: msft.head()
msft.loc[:, ["Date", "Adj Close", "Volume"]].head()

msft.loc[:, ["Date", "Adj Close", "Volume"]].tail(2)

msft.loc[:, ["Adj Close", "Volume"]].describe()

# The line break in the URL is only to make it fit on the page
url = ("https://raw.githubusercontent.com/fzumstein/"
       "python-for-excel/1st-edition/csv/MSFT.csv")
msft = pd.read_csv(url)

msft.loc[:, ["Date", "Adj Close", "Volume"]].head(2)

My sample book

Ch05 Data Analysis with pandas¶

DataFrame and Series¶

Index¶

Columns¶

Data Manipulation¶

Selecting Data¶

Setting Data¶

Missing Data¶

Duplicate Data¶

Arithmetic Operations¶

Applying a Function¶

Combining DataFrames¶

Concatenating¶

Joining and Merging¶

Data Aggregation and Descriptive Statistics¶

Descriptive Statistics¶

Grouping¶

Pivoting and Melting¶

Plotting¶

Matplotlib¶

Plotly¶

Data Import and Export¶

Exporting to a CSV file¶

Importing a CSV file¶