# Ch06 Time Series Analysis with pandas
## DatetimeIndex
# Let's start by importing the packages we use in this chapter
# and by setting the plotting backend to Plotly
import pandas as pd
import numpy as np
pd.options.plotting.backend = "plotly"
# This creates a DatetimeIndex based on a start timestamp,
# number of periods and frequency ("D" = daily).
daily_index = pd.date_range("2020-02-28", periods=4, freq="D")
daily_index
DatetimeIndex(['2020-02-28', '2020-02-29', '2020-03-01', '2020-03-02'], dtype='datetime64[ns]', freq='D')
# This creates a DatetimeIndex based on start/end timestamp.
# The frequency is set to "weekly on Sundays" ("W-SUN").
weekly_index = pd.date_range("2020-01-01", "2020-01-31", freq="W-SUN")
weekly_index
DatetimeIndex(['2020-01-05', '2020-01-12', '2020-01-19', '2020-01-26'], dtype='datetime64[ns]', freq='W-SUN')
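date_range supports many more frequency aliases than "D" and "W-SUN". If you need business days rather than calendar days, for example, here is a minimal sketch:

# Sketch: "B" generates business days (Mon-Fri); it skips
# weekends but not exchange holidays
pd.date_range("2020-02-28", periods=4, freq="B")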
# Construct a DataFrame based on the weekly_index. This could be
# the visitor count of a museum that only opens on Sundays.
pd.DataFrame(data=[21, 15, 33, 34],
             columns=["visitors"], index=weekly_index)
|            | visitors |
|------------|----------|
| 2020-01-05 | 21       |
| 2020-01-12 | 15       |
| 2020-01-19 | 33       |
| 2020-01-26 | 34       |
msft = pd.read_csv("csv/MSFT.csv")
msft.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8622 entries, 0 to 8621
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Date       8622 non-null   object
 1   Open       8622 non-null   float64
 2   High       8622 non-null   float64
 3   Low        8622 non-null   float64
 4   Close      8622 non-null   float64
 5   Adj Close  8622 non-null   float64
 6   Volume     8622 non-null   int64
dtypes: float64(5), int64(1), object(1)
memory usage: 471.6+ KB
msft.loc[:, "Date"] = pd.to_datetime(msft["Date"])
msft.dtypes
Date datetime64[ns]
Open float64
High float64
Low float64
Close float64
Adj Close float64
Volume int64
dtype: object
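If the dates were stored in a non-standard format, pd.to_datetime also accepts an explicit format string; a small sketch with a hypothetical day-first date:

# Sketch: parse a hypothetical day-first date string explicitly
pd.to_datetime("13.03.1986", format="%d.%m.%Y")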
msft = pd.read_csv("csv/MSFT.csv",
                   index_col="Date", parse_dates=["Date"])
msft.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8622 entries, 1986-03-13 to 2020-05-27
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Open       8622 non-null   float64
 1   High       8622 non-null   float64
 2   Low        8622 non-null   float64
 3   Close      8622 non-null   float64
 4   Adj Close  8622 non-null   float64
 5   Volume     8622 non-null   int64
dtypes: float64(5), int64(1)
memory usage: 471.5 KB
msft.loc[:, "Volume"] = msft["Volume"].astype("float")
msft["Volume"].dtype
dtype('float64')
msft = msft.sort_index()
msft.index.date
array([datetime.date(1986, 3, 13), datetime.date(1986, 3, 14),
datetime.date(1986, 3, 17), ..., datetime.date(2020, 5, 22),
datetime.date(2020, 5, 26), datetime.date(2020, 5, 27)],
dtype=object)
msft.loc["2019", "Adj Close"]
Date
2019-01-02 99.099190
2019-01-03 95.453529
2019-01-04 99.893005
2019-01-07 100.020401
2019-01-08 100.745613
...
2019-12-24 156.515396
2019-12-26 157.798309
2019-12-27 158.086731
2019-12-30 156.724243
2019-12-31 156.833633
Name: Adj Close, Length: 252, dtype: float64
msft.loc["2019-06":"2020-05", "Adj Close"].plot()
## Working with Time Zones
# Add the time information (4 p.m. market close) to the date
msft_close = msft.loc[:, ["Adj Close"]].copy()
msft_close.index = msft_close.index + pd.DateOffset(hours=16)
msft_close.head(2)
| Date                | Adj Close |
|---------------------|-----------|
| 1986-03-13 16:00:00 | 0.062205  |
| 1986-03-14 16:00:00 | 0.064427  |
# Make the timestamps time-zone-aware
msft_close = msft_close.tz_localize("America/New_York")
msft_close.head(2)
| Date                      | Adj Close |
|---------------------------|-----------|
| 1986-03-13 16:00:00-05:00 | 0.062205  |
| 1986-03-14 16:00:00-05:00 | 0.064427  |
msft_close = msft_close.tz_convert("UTC")
msft_close.loc["2020-01-02", "Adj Close"] # 21:00 without DST
Date
2020-01-02 21:00:00+00:00 159.737595
Name: Adj Close, dtype: float64
msft_close.loc["2020-05-01", "Adj Close"] # 20:00 with DST
Date
2020-05-01 20:00:00+00:00 174.085175
Name: Adj Close, dtype: float64
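tz_convert works in both directions, so the original New York timestamps can be recovered at any point; a quick sketch:

# Sketch: converting back to New York time restores the
# 16:00 local timestamps
msft_close.tz_convert("America/New_York").head(2)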
## Common Time Series Manipulations
### Shifting and Percentage Changes
msft_close.head()
| Date                      | Adj Close |
|---------------------------|-----------|
| 1986-03-13 21:00:00+00:00 | 0.062205  |
| 1986-03-14 21:00:00+00:00 | 0.064427  |
| 1986-03-17 21:00:00+00:00 | 0.065537  |
| 1986-03-18 21:00:00+00:00 | 0.063871  |
| 1986-03-19 21:00:00+00:00 | 0.062760  |
msft_close.shift(1).head()
| Date                      | Adj Close |
|---------------------------|-----------|
| 1986-03-13 21:00:00+00:00 | NaN       |
| 1986-03-14 21:00:00+00:00 | 0.062205  |
| 1986-03-17 21:00:00+00:00 | 0.064427  |
| 1986-03-18 21:00:00+00:00 | 0.065537  |
| 1986-03-19 21:00:00+00:00 | 0.063871  |
returns = np.log(msft_close / msft_close.shift(1))
returns = returns.rename(columns={"Adj Close": "returns"})
returns.head()
| Date                      | returns   |
|---------------------------|-----------|
| 1986-03-13 21:00:00+00:00 | NaN       |
| 1986-03-14 21:00:00+00:00 | 0.035097  |
| 1986-03-17 21:00:00+00:00 | 0.017082  |
| 1986-03-18 21:00:00+00:00 | -0.025749 |
| 1986-03-19 21:00:00+00:00 | -0.017547 |
# Plot a histogram with the daily log returns
returns.plot.hist()
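One practical advantage of log returns is that they are additive over time, so the cumulative growth factor falls out of a running sum; a minimal sketch (cum_growth is a name introduced here):

# Sketch: summing log returns and exponentiating recovers the
# cumulative growth factor of the price series
cum_growth = np.exp(returns["returns"].cumsum())
cum_growth.tail()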
simple_rets = msft_close.pct_change()
simple_rets = simple_rets.rename(columns={"Adj Close": "simple rets"})
simple_rets.head()
| Date                      | simple rets |
|---------------------------|-------------|
| 1986-03-13 21:00:00+00:00 | NaN         |
| 1986-03-14 21:00:00+00:00 | 0.035721    |
| 1986-03-17 21:00:00+00:00 | 0.017229    |
| 1986-03-18 21:00:00+00:00 | -0.025421   |
| 1986-03-19 21:00:00+00:00 | -0.017394   |
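Simple and log returns are linked by log return = log(1 + simple return); a quick numerical check based on the two DataFrames from above:

# Sketch: log returns equal log(1 + simple returns), element-wise
np.allclose(np.log(1 + simple_rets["simple rets"].dropna()),
            returns["returns"].dropna())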
### Rebasing and Correlation
parts = []  # List to collect individual DataFrames
for ticker in ["AAPL", "AMZN", "GOOGL", "MSFT"]:
    # "usecols" lets us read in only the Date and Adj Close columns
    adj_close = pd.read_csv(f"csv/{ticker}.csv",
                            index_col="Date", parse_dates=["Date"],
                            usecols=["Date", "Adj Close"])
    # Rename the column to the ticker symbol
    adj_close = adj_close.rename(columns={"Adj Close": ticker})
    # Append the stock's DataFrame to the parts list
    parts.append(adj_close)
# Combine the 4 DataFrames into a single DataFrame
adj_close = pd.concat(parts, axis=1)
adj_close
| Date       | AAPL       | AMZN        | GOOGL       | MSFT       |
|------------|------------|-------------|-------------|------------|
| 1980-12-12 | 0.405683   | NaN         | NaN         | NaN        |
| 1980-12-15 | 0.384517   | NaN         | NaN         | NaN        |
| 1980-12-16 | 0.356296   | NaN         | NaN         | NaN        |
| 1980-12-17 | 0.365115   | NaN         | NaN         | NaN        |
| 1980-12-18 | 0.375698   | NaN         | NaN         | NaN        |
| ...        | ...        | ...         | ...         | ...        |
| 2020-05-22 | 318.890015 | 2436.879883 | 1413.239990 | 183.509995 |
| 2020-05-26 | 316.730011 | 2421.860107 | 1421.369995 | 181.570007 |
| 2020-05-27 | 318.109985 | 2410.389893 | 1420.280029 | 181.809998 |
| 2020-05-28 | 318.250000 | 2401.100098 | 1418.239990 | NaN        |
| 2020-05-29 | 317.940002 | 2442.370117 | 1433.520020 | NaN        |

9950 rows × 4 columns
adj_close = adj_close.dropna()
adj_close.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3970 entries, 2004-08-19 to 2020-05-27
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   AAPL    3970 non-null   float64
 1   AMZN    3970 non-null   float64
 2   GOOGL   3970 non-null   float64
 3   MSFT    3970 non-null   float64
dtypes: float64(4)
memory usage: 155.1 KB
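By default, pd.concat with axis=1 aligns the DataFrames on their index via an outer join, which is what produced the NaN rows above. If you only want dates covered by all four stocks, an inner join makes the dropna step unnecessary; a sketch (adj_close_inner is a name introduced here):

# Sketch: an inner join keeps only dates present in all four DataFrames
adj_close_inner = pd.concat(parts, axis=1, join="inner")
adj_close_inner.shape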
# Use a sample from June 2019 - May 2020
adj_close_sample = adj_close.loc["2019-06":"2020-05", :]
rebased_prices = adj_close_sample / adj_close_sample.iloc[0, :] * 100
rebased_prices.head(2)
| Date       | AAPL       | AMZN       | GOOGL     | MSFT       |
|------------|------------|------------|-----------|------------|
| 2019-06-03 | 100.000000 | 100.000000 | 100.00000 | 100.000000 |
| 2019-06-04 | 103.658406 | 102.178197 | 101.51626 | 102.770372 |
rebased_prices.plot()
# Correlation of daily log returns
returns = np.log(adj_close / adj_close.shift(1))
returns.corr()
|       | AAPL     | AMZN     | GOOGL    | MSFT     |
|-------|----------|----------|----------|----------|
| AAPL  | 1.000000 | 0.424910 | 0.503497 | 0.486065 |
| AMZN  | 0.424910 | 1.000000 | 0.486690 | 0.485725 |
| GOOGL | 0.503497 | 0.486690 | 1.000000 | 0.525645 |
| MSFT  | 0.486065 | 0.485725 | 0.525645 | 1.000000 |
import plotly.express as px
fig = px.imshow(returns.corr(),
                x=adj_close.columns,
                y=adj_close.columns,
                color_continuous_scale=list(
                    reversed(px.colors.sequential.RdBu)),
                zmin=-1, zmax=1)
fig.show()
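Correlations are not stable over time. One way to see this is a rolling correlation; a sketch with an assumed window of 252 trading days (roughly one year):

# Sketch: rolling one-year correlation of AAPL vs. MSFT log returns
returns["AAPL"].rolling(252).corr(returns["MSFT"]).plot()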
### Resampling
end_of_month = adj_close.resample("M").last()
end_of_month.head()
| Date       | AAPL     | AMZN      | GOOGL     | MSFT      |
|------------|----------|-----------|-----------|-----------|
| 2004-08-31 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-09-30 | 2.396127 | 40.860001 | 64.864868 | 17.900215 |
| 2004-10-31 | 3.240182 | 34.130001 | 95.415413 | 18.107374 |
| 2004-11-30 | 4.146072 | 39.680000 | 91.081078 | 19.344421 |
| 2004-12-31 | 3.982207 | 44.290001 | 96.491493 | 19.279480 |
end_of_month.resample("D").asfreq().head() # No transformation
| Date       | AAPL     | AMZN      | GOOGL     | MSFT     |
|------------|----------|-----------|-----------|----------|
| 2004-08-31 | 2.132708 | 38.139999 | 51.236237 | 17.67363 |
| 2004-09-01 | NaN      | NaN       | NaN       | NaN      |
| 2004-09-02 | NaN      | NaN       | NaN       | NaN      |
| 2004-09-03 | NaN      | NaN       | NaN       | NaN      |
| 2004-09-04 | NaN      | NaN       | NaN       | NaN      |
end_of_month.resample("W-FRI").ffill().head() # Forward fill
| Date       | AAPL     | AMZN      | GOOGL     | MSFT      |
|------------|----------|-----------|-----------|-----------|
| 2004-09-03 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-09-10 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-09-17 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-09-24 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-10-01 | 2.396127 | 40.860001 | 64.864868 | 17.900215 |
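last is just one possible aggregation; resample accepts the same reductions as groupby, such as mean, max, or sum. A sketch that averages all daily prices within each month instead:

# Sketch: downsample to month-end frequency with a monthly mean
adj_close.resample("M").mean().head(2)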
### Rolling Windows
# Plot the moving average for MSFT with data from 2019
msft19 = msft.loc["2019", ["Adj Close"]].copy()
# Add the 25-day moving average as a new column to the DataFrame
msft19.loc[:, "25day average"] = msft19["Adj Close"].rolling(25).mean()
msft19.plot()
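rolling supports other reductions besides mean; a sketch that adds a 25-day rolling standard deviation as a simple volatility measure ("25day std" is a column name introduced here):

# Sketch: 25-day rolling standard deviation next to the moving average
msft19.loc[:, "25day std"] = msft19["Adj Close"].rolling(25).std()
msft19.plot()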