Ch06 Time Series Analysis with pandas

DatetimeIndex

# Let's start by importing the packages we use in this chapter
# and by setting the plotting backend to Plotly
# (df.plot() will then return interactive Plotly figures instead of Matplotlib)
import pandas as pd
import numpy as np
pd.options.plotting.backend = "plotly"
# This creates a DatetimeIndex based on a start timestamp,
# number of periods and frequency ("D" = daily).
# Note that it handles the leap day 2020-02-29 correctly.
daily_index = pd.date_range("2020-02-28", periods=4, freq="D")
daily_index
DatetimeIndex(['2020-02-28', '2020-02-29', '2020-03-01', '2020-03-02'], dtype='datetime64[ns]', freq='D')
# A DatetimeIndex can also be built from an explicit start/end pair.
# Sampling weekly on Sundays ("W-SUN") yields every Sunday that falls
# within January 2020.
weekly_index = pd.date_range(start="2020-01-01", end="2020-01-31", freq="W-SUN")
weekly_index
DatetimeIndex(['2020-01-05', '2020-01-12', '2020-01-19', '2020-01-26'], dtype='datetime64[ns]', freq='W-SUN')
# Build a DataFrame on top of weekly_index. This could be the
# visitor count of a museum that only opens on Sundays.
pd.DataFrame({"visitors": [21, 15, 33, 34]}, index=weekly_index)
visitors
2020-01-05 21
2020-01-12 15
2020-01-19 33
2020-01-26 34
# Read the Microsoft price history. Without parse_dates, the Date
# column comes in as plain strings (object dtype) — see info() below.
msft = pd.read_csv("csv/MSFT.csv")
msft.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8622 entries, 0 to 8621
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       8622 non-null   object 
 1   Open       8622 non-null   float64
 2   High       8622 non-null   float64
 3   Low        8622 non-null   float64
 4   Close      8622 non-null   float64
 5   Adj Close  8622 non-null   float64
 6   Volume     8622 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 471.6+ KB
# Parse the Date strings into proper datetime64 values.
# Use plain column assignment so the column is replaced with the new
# dtype: assigning via msft.loc[:, "Date"] tries to write the values
# into the existing object-dtype column in place, which is a
# deprecated "incompatible dtype" setitem in pandas >= 2.x.
msft["Date"] = pd.to_datetime(msft["Date"])
msft.dtypes
Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume                int64
dtype: object
# Re-read the file, this time parsing the Date column during the read
# and using it as the (Datetime)Index via index_col + parse_dates.
msft = pd.read_csv("csv/MSFT.csv",
                   index_col="Date", parse_dates=["Date"])
msft.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8622 entries, 1986-03-13 to 2020-05-27
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       8622 non-null   float64
 1   High       8622 non-null   float64
 2   Low        8622 non-null   float64
 3   Close      8622 non-null   float64
 4   Adj Close  8622 non-null   float64
 5   Volume     8622 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 471.5 KB
# Cast Volume from int64 to float64. Use plain column assignment so
# the column is replaced with the new dtype: writing floats into the
# existing int64 column via msft.loc[:, "Volume"] is an
# incompatible-dtype setitem, deprecated in pandas >= 2.x.
msft["Volume"] = msft["Volume"].astype("float")
msft["Volume"].dtype
dtype('float64')
# Make sure the index is sorted chronologically — required for
# reliable date-range slicing further below.
msft = msft.sort_index()
# .date strips the time part, returning an array of datetime.date objects
msft.index.date
array([datetime.date(1986, 3, 13), datetime.date(1986, 3, 14),
       datetime.date(1986, 3, 17), ..., datetime.date(2020, 5, 22),
       datetime.date(2020, 5, 26), datetime.date(2020, 5, 27)],
      dtype=object)
# Partial string indexing: the string "2019" selects every row whose
# timestamp falls within that year.
msft.loc["2019", "Adj Close"]
Date
2019-01-02     99.099190
2019-01-03     95.453529
2019-01-04     99.893005
2019-01-07    100.020401
2019-01-08    100.745613
                 ...    
2019-12-24    156.515396
2019-12-26    157.798309
2019-12-27    158.086731
2019-12-30    156.724243
2019-12-31    156.833633
Name: Adj Close, Length: 252, dtype: float64
# Plot one year of adjusted close prices (June 2019 through May 2020)
msft.loc["2019-06":"2020-05", "Adj Close"].plot()

Working with Time Zones

# Add the time information to the date
msft_close = msft.loc[:, ["Adj Close"]].copy()
# Shift each midnight timestamp by 16 hours so it reflects the
# 4 p.m. market close (local New York time).
msft_close.index = msft_close.index + pd.DateOffset(hours=16)
msft_close.head(2)
Adj Close
Date
1986-03-13 16:00:00 0.062205
1986-03-14 16:00:00 0.064427
# Make the timestamps time-zone-aware: tz_localize attaches the
# "America/New_York" zone without shifting the wall-clock time.
msft_close = msft_close.tz_localize("America/New_York")
msft_close.head(2)
Adj Close
Date
1986-03-13 16:00:00-05:00 0.062205
1986-03-14 16:00:00-05:00 0.064427
# Convert the index from New York time to UTC; the wall-clock times
# now shift by the UTC offset (-5h in winter, -4h during DST).
msft_close = msft_close.tz_convert("UTC")
msft_close.loc["2020-01-02", "Adj Close"]  # 21:00 without DST
Date
2020-01-02 21:00:00+00:00    159.737595
Name: Adj Close, dtype: float64
# During daylight saving time, the same 16:00 New York close maps to 20:00 UTC
msft_close.loc["2020-05-01", "Adj Close"]  # 20:00 with DST
Date
2020-05-01 20:00:00+00:00    174.085175
Name: Adj Close, dtype: float64

Common Time Series Manipulations

Shifting and Percentage Changes

# First five rows of the time-zone-aware close prices, for comparison
msft_close.head()
Adj Close
Date
1986-03-13 21:00:00+00:00 0.062205
1986-03-14 21:00:00+00:00 0.064427
1986-03-17 21:00:00+00:00 0.065537
1986-03-18 21:00:00+00:00 0.063871
1986-03-19 21:00:00+00:00 0.062760
# shift(1) moves every row down by one: row t now holds the value
# from t-1, and the first row becomes NaN
msft_close.shift(1).head()
Adj Close
Date
1986-03-13 21:00:00+00:00 NaN
1986-03-14 21:00:00+00:00 0.062205
1986-03-17 21:00:00+00:00 0.064427
1986-03-18 21:00:00+00:00 0.065537
1986-03-19 21:00:00+00:00 0.063871
# Daily log returns: ln(P_t / P_{t-1})
price_ratio = msft_close.div(msft_close.shift(1))
returns = np.log(price_ratio).rename(columns={"Adj Close": "returns"})
returns.head()
returns
Date
1986-03-13 21:00:00+00:00 NaN
1986-03-14 21:00:00+00:00 0.035097
1986-03-17 21:00:00+00:00 0.017082
1986-03-18 21:00:00+00:00 -0.025749
1986-03-19 21:00:00+00:00 -0.017547
# Plot a histogram with the daily log returns
returns.plot.hist()
# Simple (arithmetic) returns for comparison: P_t / P_{t-1} - 1
# NOTE(review): pandas >= 2.1 deprecates pct_change's default
# fill_method; pass fill_method=None there to silence the warning.
simple_rets = msft_close.pct_change()
simple_rets = simple_rets.rename(columns={"Adj Close": "simple rets"})
simple_rets.head()
simple rets
Date
1986-03-13 21:00:00+00:00 NaN
1986-03-14 21:00:00+00:00 0.035721
1986-03-17 21:00:00+00:00 0.017229
1986-03-18 21:00:00+00:00 -0.025421
1986-03-19 21:00:00+00:00 -0.017394

Rebasing and Correlation

# Read the Date and Adj Close columns of each ticker's CSV file,
# rename the price column to the ticker symbol, and collect the
# per-ticker DataFrames in a list.
parts = []
for symbol in ["AAPL", "AMZN", "GOOGL", "MSFT"]:
    frame = pd.read_csv(f"csv/{symbol}.csv",
                        index_col="Date", parse_dates=["Date"],
                        usecols=["Date", "Adj Close"])
    parts.append(frame.rename(columns={"Adj Close": symbol}))
# Align the four DataFrames side by side on their DatetimeIndex
adj_close = pd.concat(parts, axis=1)
adj_close
AAPL AMZN GOOGL MSFT
Date
1980-12-12 0.405683 NaN NaN NaN
1980-12-15 0.384517 NaN NaN NaN
1980-12-16 0.356296 NaN NaN NaN
1980-12-17 0.365115 NaN NaN NaN
1980-12-18 0.375698 NaN NaN NaN
... ... ... ... ...
2020-05-22 318.890015 2436.879883 1413.239990 183.509995
2020-05-26 316.730011 2421.860107 1421.369995 181.570007
2020-05-27 318.109985 2410.389893 1420.280029 181.809998
2020-05-28 318.250000 2401.100098 1418.239990 NaN
2020-05-29 317.940002 2442.370117 1433.520020 NaN

9950 rows × 4 columns

# Drop all rows that contain missing values, so every remaining date
# has a price for all four tickers (the sample starts at the latest IPO).
adj_close = adj_close.dropna()
adj_close.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3970 entries, 2004-08-19 to 2020-05-27
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    3970 non-null   float64
 1   AMZN    3970 non-null   float64
 2   GOOGL   3970 non-null   float64
 3   MSFT    3970 non-null   float64
dtypes: float64(4)
memory usage: 155.1 KB
# Use a sample from June 2019 - May 2020
adj_close_sample = adj_close.loc["2019-06":"2020-05", :]
# Rebase each series so it starts at 100 on the sample's first day,
# making the four stocks' performance directly comparable.
rebased_prices = adj_close_sample / adj_close_sample.iloc[0, :] * 100
rebased_prices.head(2)
AAPL AMZN GOOGL MSFT
Date
2019-06-03 100.000000 100.000000 100.00000 100.000000
2019-06-04 103.658406 102.178197 101.51626 102.770372
# Plot the rebased prices, all starting at 100
rebased_prices.plot()
# Correlation of daily log returns
returns = np.log(adj_close / adj_close.shift(1))
returns.corr()
AAPL AMZN GOOGL MSFT
AAPL 1.000000 0.424910 0.503497 0.486065
AMZN 0.424910 1.000000 0.486690 0.485725
GOOGL 0.503497 0.486690 1.000000 0.525645
MSFT 0.486065 0.485725 0.525645 1.000000
import plotly.express as px
# Heatmap of the correlation matrix; reversing the RdBu scale maps
# negative correlations to red and positive ones to blue, with the
# color range pinned to [-1, 1].
fig = px.imshow(
    returns.corr(),
    x=adj_close.columns,
    y=adj_close.columns,
    color_continuous_scale=list(reversed(px.colors.sequential.RdBu)),
    zmin=-1,
    zmax=1,
)
fig.show()

Resampling

# Downsample the daily prices to month-end frequency, keeping the last
# observation of each month. NOTE(review): pandas >= 2.2 deprecates the
# "M" alias in favor of "ME".
end_of_month = adj_close.resample("M").last()
end_of_month.head()
AAPL AMZN GOOGL MSFT
Date
2004-08-31 2.132708 38.139999 51.236237 17.673630
2004-09-30 2.396127 40.860001 64.864868 17.900215
2004-10-31 3.240182 34.130001 95.415413 18.107374
2004-11-30 4.146072 39.680000 91.081078 19.344421
2004-12-31 3.982207 44.290001 96.491493 19.279480
# Upsample back to daily frequency; asfreq inserts NaN for the newly
# created rows instead of filling them
end_of_month.resample("D").asfreq().head()  # No transformation
AAPL AMZN GOOGL MSFT
Date
2004-08-31 2.132708 38.139999 51.236237 17.67363
2004-09-01 NaN NaN NaN NaN
2004-09-02 NaN NaN NaN NaN
2004-09-03 NaN NaN NaN NaN
2004-09-04 NaN NaN NaN NaN
# Upsample to weekly frequency (Fridays), carrying the last known
# month-end value forward into the new rows
end_of_month.resample("W-FRI").ffill().head()  # Forward fill
AAPL AMZN GOOGL MSFT
Date
2004-09-03 2.132708 38.139999 51.236237 17.673630
2004-09-10 2.132708 38.139999 51.236237 17.673630
2004-09-17 2.132708 38.139999 51.236237 17.673630
2004-09-24 2.132708 38.139999 51.236237 17.673630
2004-10-01 2.396127 40.860001 64.864868 17.900215

Rolling Windows

# Plot the moving average for MSFT with data from 2019
msft19 = msft.loc["2019", ["Adj Close"]].copy()
# Add the 25 day moving average as a new column to the DataFrame
# (rolling(25).mean() yields NaN for the first 24 rows of the window)
msft19.loc[:, "25day average"] = msft19["Adj Close"].rolling(25).mean()
msft19.plot()