# Ch06 Time Series Analysis with pandas
## DatetimeIndex
# Let's start by importing the packages we use in this chapter
# and by setting the plotting backend to Plotly
import pandas as pd
import numpy as np
pd.options.plotting.backend = "plotly"
# This creates a DatetimeIndex based on a start timestamp,
# number of periods and frequency ("D" = daily).
daily_index = pd.date_range("2020-02-28", periods=4, freq="D")
daily_index
DatetimeIndex(['2020-02-28', '2020-02-29', '2020-03-01', '2020-03-02'], dtype='datetime64[ns]', freq='D')
# This creates a DatetimeIndex based on start/end timestamp.
# The frequency is set to "weekly on Sundays" ("W-SUN").
weekly_index = pd.date_range("2020-01-01", "2020-01-31", freq="W-SUN")
weekly_index
DatetimeIndex(['2020-01-05', '2020-01-12', '2020-01-19', '2020-01-26'], dtype='datetime64[ns]', freq='W-SUN')
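date_range supports many more frequency aliases than "D" and "W-SUN". If you need business days rather than calendar days, for example, here is a minimal sketch:

# Sketch: "B" generates business days (Mon-Fri); it skips
# weekends but not exchange holidays
pd.date_range("2020-02-28", periods=4, freq="B")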
# Construct a DataFrame based on the weekly_index. This could be
# the visitor count of a museum that only opens on Sundays.
pd.DataFrame(data=[21, 15, 33, 34],
             columns=["visitors"], index=weekly_index)
|            | visitors |
|------------|----------|
| 2020-01-05 | 21       |
| 2020-01-12 | 15       |
| 2020-01-19 | 33       |
| 2020-01-26 | 34       |
msft = pd.read_csv("csv/MSFT.csv")
msft.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8622 entries, 0 to 8621
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Date       8622 non-null   object
 1   Open       8622 non-null   float64
 2   High       8622 non-null   float64
 3   Low        8622 non-null   float64
 4   Close      8622 non-null   float64
 5   Adj Close  8622 non-null   float64
 6   Volume     8622 non-null   int64
dtypes: float64(5), int64(1), object(1)
memory usage: 471.6+ KB
msft.loc[:, "Date"] = pd.to_datetime(msft["Date"])
msft.dtypes
Date datetime64[ns]
Open float64
High float64
Low float64
Close float64
Adj Close float64
Volume int64
dtype: object
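If the dates were stored in a non-standard format, pd.to_datetime also accepts an explicit format string; a small sketch with a hypothetical day-first date:

# Sketch: parse a hypothetical day-first date string explicitly
pd.to_datetime("13.03.1986", format="%d.%m.%Y")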
msft = pd.read_csv("csv/MSFT.csv",
                   index_col="Date", parse_dates=["Date"])
msft.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8622 entries, 1986-03-13 to 2020-05-27
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Open       8622 non-null   float64
 1   High       8622 non-null   float64
 2   Low        8622 non-null   float64
 3   Close      8622 non-null   float64
 4   Adj Close  8622 non-null   float64
 5   Volume     8622 non-null   int64
dtypes: float64(5), int64(1)
memory usage: 471.5 KB
msft.loc[:, "Volume"] = msft["Volume"].astype("float")
msft["Volume"].dtype
dtype('float64')
msft = msft.sort_index()
msft.index.date
array([datetime.date(1986, 3, 13), datetime.date(1986, 3, 14),
datetime.date(1986, 3, 17), ..., datetime.date(2020, 5, 22),
datetime.date(2020, 5, 26), datetime.date(2020, 5, 27)],
dtype=object)
msft.loc["2019", "Adj Close"]
Date
2019-01-02 99.099190
2019-01-03 95.453529
2019-01-04 99.893005
2019-01-07 100.020401
2019-01-08 100.745613
...
2019-12-24 156.515396
2019-12-26 157.798309
2019-12-27 158.086731
2019-12-30 156.724243
2019-12-31 156.833633
Name: Adj Close, Length: 252, dtype: float64
msft.loc["2019-06":"2020-05", "Adj Close"].plot()
## Working with Time Zones
# Add the time information (4 p.m. market close) to the date
msft_close = msft.loc[:, ["Adj Close"]].copy()
msft_close.index = msft_close.index + pd.DateOffset(hours=16)
msft_close.head(2)
| Date                | Adj Close |
|---------------------|-----------|
| 1986-03-13 16:00:00 | 0.062205  |
| 1986-03-14 16:00:00 | 0.064427  |
# Make the timestamps time-zone-aware
msft_close = msft_close.tz_localize("America/New_York")
msft_close.head(2)
| Date                      | Adj Close |
|---------------------------|-----------|
| 1986-03-13 16:00:00-05:00 | 0.062205  |
| 1986-03-14 16:00:00-05:00 | 0.064427  |
msft_close = msft_close.tz_convert("UTC")
msft_close.loc["2020-01-02", "Adj Close"] # 21:00 without DST
Date
2020-01-02 21:00:00+00:00 159.737595
Name: Adj Close, dtype: float64
msft_close.loc["2020-05-01", "Adj Close"] # 20:00 with DST
Date
2020-05-01 20:00:00+00:00 174.085175
Name: Adj Close, dtype: float64
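tz_convert works in both directions, so the original New York timestamps can be recovered at any point; a quick sketch:

# Sketch: converting back to New York time restores the
# 16:00 local timestamps
msft_close.tz_convert("America/New_York").head(2)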
## Common Time Series Manipulations
### Shifting and Percentage Changes
msft_close.head()
| Date                      | Adj Close |
|---------------------------|-----------|
| 1986-03-13 21:00:00+00:00 | 0.062205  |
| 1986-03-14 21:00:00+00:00 | 0.064427  |
| 1986-03-17 21:00:00+00:00 | 0.065537  |
| 1986-03-18 21:00:00+00:00 | 0.063871  |
| 1986-03-19 21:00:00+00:00 | 0.062760  |
msft_close.shift(1).head()
| Date                      | Adj Close |
|---------------------------|-----------|
| 1986-03-13 21:00:00+00:00 | NaN       |
| 1986-03-14 21:00:00+00:00 | 0.062205  |
| 1986-03-17 21:00:00+00:00 | 0.064427  |
| 1986-03-18 21:00:00+00:00 | 0.065537  |
| 1986-03-19 21:00:00+00:00 | 0.063871  |
returns = np.log(msft_close / msft_close.shift(1))
returns = returns.rename(columns={"Adj Close": "returns"})
returns.head()
| Date                      | returns   |
|---------------------------|-----------|
| 1986-03-13 21:00:00+00:00 | NaN       |
| 1986-03-14 21:00:00+00:00 | 0.035097  |
| 1986-03-17 21:00:00+00:00 | 0.017082  |
| 1986-03-18 21:00:00+00:00 | -0.025749 |
| 1986-03-19 21:00:00+00:00 | -0.017547 |
# Plot a histogram with the daily log returns
returns.plot.hist()
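One practical advantage of log returns is that they are additive over time, so the cumulative growth factor falls out of a running sum; a minimal sketch (cum_growth is a name introduced here):

# Sketch: summing log returns and exponentiating recovers the
# cumulative growth factor of the price series
cum_growth = np.exp(returns["returns"].cumsum())
cum_growth.tail()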
simple_rets = msft_close.pct_change()
simple_rets = simple_rets.rename(columns={"Adj Close": "simple rets"})
simple_rets.head()
| Date                      | simple rets |
|---------------------------|-------------|
| 1986-03-13 21:00:00+00:00 | NaN         |
| 1986-03-14 21:00:00+00:00 | 0.035721    |
| 1986-03-17 21:00:00+00:00 | 0.017229    |
| 1986-03-18 21:00:00+00:00 | -0.025421   |
| 1986-03-19 21:00:00+00:00 | -0.017394   |
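Simple and log returns are linked by log return = log(1 + simple return); a quick numerical check based on the two DataFrames from above:

# Sketch: log returns equal log(1 + simple returns), element-wise
np.allclose(np.log(1 + simple_rets["simple rets"].dropna()),
            returns["returns"].dropna())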
### Rebasing and Correlation
parts = []  # List to collect individual DataFrames
for ticker in ["AAPL", "AMZN", "GOOGL", "MSFT"]:
    # "usecols" lets us read in only the Date and Adj Close columns
    adj_close = pd.read_csv(f"csv/{ticker}.csv",
                            index_col="Date", parse_dates=["Date"],
                            usecols=["Date", "Adj Close"])
    # Rename the column to the ticker symbol
    adj_close = adj_close.rename(columns={"Adj Close": ticker})
    # Append the stock's DataFrame to the parts list
    parts.append(adj_close)
# Combine the 4 DataFrames into a single DataFrame
adj_close = pd.concat(parts, axis=1)
adj_close
| Date       | AAPL       | AMZN        | GOOGL       | MSFT       |
|------------|------------|-------------|-------------|------------|
| 1980-12-12 | 0.405683   | NaN         | NaN         | NaN        |
| 1980-12-15 | 0.384517   | NaN         | NaN         | NaN        |
| 1980-12-16 | 0.356296   | NaN         | NaN         | NaN        |
| 1980-12-17 | 0.365115   | NaN         | NaN         | NaN        |
| 1980-12-18 | 0.375698   | NaN         | NaN         | NaN        |
| ...        | ...        | ...         | ...         | ...        |
| 2020-05-22 | 318.890015 | 2436.879883 | 1413.239990 | 183.509995 |
| 2020-05-26 | 316.730011 | 2421.860107 | 1421.369995 | 181.570007 |
| 2020-05-27 | 318.109985 | 2410.389893 | 1420.280029 | 181.809998 |
| 2020-05-28 | 318.250000 | 2401.100098 | 1418.239990 | NaN        |
| 2020-05-29 | 317.940002 | 2442.370117 | 1433.520020 | NaN        |

9950 rows × 4 columns
adj_close = adj_close.dropna()
adj_close.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3970 entries, 2004-08-19 to 2020-05-27
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   AAPL    3970 non-null   float64
 1   AMZN    3970 non-null   float64
 2   GOOGL   3970 non-null   float64
 3   MSFT    3970 non-null   float64
dtypes: float64(4)
memory usage: 155.1 KB
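By default, pd.concat with axis=1 aligns the DataFrames on their index via an outer join, which is what produced the NaN rows above. If you only want dates covered by all four stocks, an inner join makes the dropna step unnecessary; a sketch (adj_close_inner is a name introduced here):

# Sketch: an inner join keeps only dates present in all four DataFrames
adj_close_inner = pd.concat(parts, axis=1, join="inner")
adj_close_inner.shape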
# Use a sample from June 2019 - May 2020
adj_close_sample = adj_close.loc["2019-06":"2020-05", :]
rebased_prices = adj_close_sample / adj_close_sample.iloc[0, :] * 100
rebased_prices.head(2)
| Date       | AAPL       | AMZN       | GOOGL     | MSFT       |
|------------|------------|------------|-----------|------------|
| 2019-06-03 | 100.000000 | 100.000000 | 100.00000 | 100.000000 |
| 2019-06-04 | 103.658406 | 102.178197 | 101.51626 | 102.770372 |
rebased_prices.plot()
# Correlation of daily log returns
returns = np.log(adj_close / adj_close.shift(1))
returns.corr()
|       | AAPL     | AMZN     | GOOGL    | MSFT     |
|-------|----------|----------|----------|----------|
| AAPL  | 1.000000 | 0.424910 | 0.503497 | 0.486065 |
| AMZN  | 0.424910 | 1.000000 | 0.486690 | 0.485725 |
| GOOGL | 0.503497 | 0.486690 | 1.000000 | 0.525645 |
| MSFT  | 0.486065 | 0.485725 | 0.525645 | 1.000000 |
import plotly.express as px
fig = px.imshow(returns.corr(),
                x=adj_close.columns,
                y=adj_close.columns,
                color_continuous_scale=list(
                    reversed(px.colors.sequential.RdBu)),
                zmin=-1, zmax=1)
fig.show()
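Correlations are not stable over time. One way to see this is a rolling correlation; a sketch with an assumed window of 252 trading days (roughly one year):

# Sketch: rolling one-year correlation of AAPL vs. MSFT log returns
returns["AAPL"].rolling(252).corr(returns["MSFT"]).plot()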
### Resampling
end_of_month = adj_close.resample("M").last()
end_of_month.head()
| Date       | AAPL     | AMZN      | GOOGL     | MSFT      |
|------------|----------|-----------|-----------|-----------|
| 2004-08-31 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-09-30 | 2.396127 | 40.860001 | 64.864868 | 17.900215 |
| 2004-10-31 | 3.240182 | 34.130001 | 95.415413 | 18.107374 |
| 2004-11-30 | 4.146072 | 39.680000 | 91.081078 | 19.344421 |
| 2004-12-31 | 3.982207 | 44.290001 | 96.491493 | 19.279480 |
end_of_month.resample("D").asfreq().head() # No transformation
| Date       | AAPL     | AMZN      | GOOGL     | MSFT     |
|------------|----------|-----------|-----------|----------|
| 2004-08-31 | 2.132708 | 38.139999 | 51.236237 | 17.67363 |
| 2004-09-01 | NaN      | NaN       | NaN       | NaN      |
| 2004-09-02 | NaN      | NaN       | NaN       | NaN      |
| 2004-09-03 | NaN      | NaN       | NaN       | NaN      |
| 2004-09-04 | NaN      | NaN       | NaN       | NaN      |
end_of_month.resample("W-FRI").ffill().head() # Forward fill
| Date       | AAPL     | AMZN      | GOOGL     | MSFT      |
|------------|----------|-----------|-----------|-----------|
| 2004-09-03 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-09-10 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-09-17 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-09-24 | 2.132708 | 38.139999 | 51.236237 | 17.673630 |
| 2004-10-01 | 2.396127 | 40.860001 | 64.864868 | 17.900215 |
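last is just one possible aggregation; resample accepts the same reductions as groupby, such as mean, max, or sum. A sketch that averages all daily prices within each month instead:

# Sketch: downsample to month-end frequency with a monthly mean
adj_close.resample("M").mean().head(2)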
### Rolling Windows
# Plot the moving average for MSFT with data from 2019
msft19 = msft.loc["2019", ["Adj Close"]].copy()
# Add the 25-day moving average as a new column to the DataFrame
msft19.loc[:, "25day average"] = msft19["Adj Close"].rolling(25).mean()
msft19.plot()
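rolling supports other reductions besides mean; a sketch that adds a 25-day rolling standard deviation as a simple volatility measure ("25day std" is a column name introduced here):

# Sketch: 25-day rolling standard deviation next to the moving average
msft19.loc[:, "25day std"] = msft19["Adj Close"].rolling(25).std()
msft19.plot()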