{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Ch7 Excel File Manipulation with pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Case Study: Excel Reporting" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 9493 entries, 0 to 9492\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 transaction_id 9493 non-null object \n", " 1 store 9493 non-null object \n", " 2 status 9493 non-null object \n", " 3 transaction_date 9493 non-null datetime64[ns]\n", " 4 plan 9493 non-null object \n", " 5 contract_type 9493 non-null object \n", " 6 amount 9493 non-null float64 \n", "dtypes: datetime64[ns](1), float64(1), object(5)\n", "memory usage: 519.3+ KB\n" ] } ], "source": [ "df = pd.read_excel(\"sales_data/new/January.xlsx\")\n", "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading Excel Files with pandas" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StoreEmployeesManagerSinceFlagship
0New York10Sarah2018-07-20False
1San Francisco12Neriah2019-11-02MISSING
2Chicago4Katelin2020-01-31NaN
3Boston5Georgiana2017-04-01True
4Washington DC3EvanNaTFalse
5Las Vegas11Paul2020-01-06False
\n", "
" ], "text/plain": [ " Store Employees Manager Since Flagship\n", "0 New York 10 Sarah 2018-07-20 False\n", "1 San Francisco 12 Neriah 2019-11-02 MISSING\n", "2 Chicago 4 Katelin 2020-01-31 NaN\n", "3 Boston 5 Georgiana 2017-04-01 True\n", "4 Washington DC 3 Evan NaT False\n", "5 Las Vegas 11 Paul 2020-01-06 False" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_excel(\"xl/stores.xlsx\",\n", " sheet_name=\"2019\", skiprows=1, usecols=\"B:F\")\n", "df" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 6 entries, 0 to 5\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Store 6 non-null object \n", " 1 Employees 6 non-null int64 \n", " 2 Manager 6 non-null object \n", " 3 Since 5 non-null datetime64[ns]\n", " 4 Flagship 5 non-null object \n", "dtypes: datetime64[ns](1), int64(1), object(3)\n", "memory usage: 368.0+ bytes\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "def fix_missing(x):\n", " return False if x in [\"\", \"MISSING\"] else x" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StoreEmployeesManagerSinceFlagship
0New York10Sarah2018-07-20False
1San Francisco12Neriah2019-11-02False
2Chicago4Katelin2020-01-31False
3Boston5Georgiana2017-04-01True
4Washington DC3EvanNaTFalse
5Las Vegas11Paul2020-01-06False
\n", "
" ], "text/plain": [ " Store Employees Manager Since Flagship\n", "0 New York 10 Sarah 2018-07-20 False\n", "1 San Francisco 12 Neriah 2019-11-02 False\n", "2 Chicago 4 Katelin 2020-01-31 False\n", "3 Boston 5 Georgiana 2017-04-01 True\n", "4 Washington DC 3 Evan NaT False\n", "5 Las Vegas 11 Paul 2020-01-06 False" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_excel(\"xl/stores.xlsx\",\n", " sheet_name=\"2019\", skiprows=1, usecols=\"B:F\",\n", " converters={\"Flagship\": fix_missing})\n", "df" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 6 entries, 0 to 5\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Store 6 non-null object \n", " 1 Employees 6 non-null int64 \n", " 2 Manager 6 non-null object \n", " 3 Since 5 non-null datetime64[ns]\n", " 4 Flagship 6 non-null bool \n", "dtypes: bool(1), datetime64[ns](1), int64(1), object(2)\n", "memory usage: 326.0+ bytes\n" ] } ], "source": [ "# The Flagship column now has Dtype \"bool\"\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StoreEmployees
0New York10
1San Francisco12
\n", "
" ], "text/plain": [ " Store Employees\n", "0 New York 10\n", "1 San Francisco 12" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sheets = pd.read_excel(\"xl/stores.xlsx\", sheet_name=[\"2019\", \"2020\"],\n", " skiprows=1, usecols=[\"Store\", \"Employees\"])\n", "sheets[\"2019\"].head(2)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BranchEmployee_CountIs_Flagship
0New York10False
1San Francisco12MISSING
2Chicago4NaN
\n", "
" ], "text/plain": [ " Branch Employee_Count Is_Flagship\n", "0 New York 10 False\n", "1 San Francisco 12 MISSING\n", "2 Chicago 4 NaN" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_excel(\"xl/stores.xlsx\", sheet_name=0,\n", " skiprows=2, skipfooter=3,\n", " usecols=\"B:C,F\", header=None,\n", " names=[\"Branch\", \"Employee_Count\", \"Is_Flagship\"])\n", "df" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StoreEmployeesFlagship
0New York10False
1San Francisco12NaN
2Chicago4
3Boston5True
\n", "
" ], "text/plain": [ " Store Employees Flagship\n", "0 New York 10 False\n", "1 San Francisco 12 NaN\n", "2 Chicago 4 \n", "3 Boston 5 True" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_excel(\"xl/stores.xlsx\", sheet_name=\"2019\",\n", " skiprows=1, usecols=\"B,C,F\", skipfooter=2,\n", " na_values=\"MISSING\", keep_default_na=False)\n", "df" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "f = open(\"output.txt\", \"w\")\n", "f.write(\"Some text\")\n", "f.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Context Managers and the with Statement" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "with open(\"output.txt\", \"w\") as f:\n", " f.write(\"Some text\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StoreEmployeesManagerSinceFlagship
0New York10Sarah2018-07-20False
1San Francisco12Neriah2019-11-02MISSING
\n", "
" ], "text/plain": [ " Store Employees Manager Since Flagship\n", "0 New York 10 Sarah 2018-07-20 False\n", "1 San Francisco 12 Neriah 2019-11-02 MISSING" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with pd.ExcelFile(\"xl/stores.xls\") as f:\n", " df1 = pd.read_excel(f, \"2019\", skiprows=1, usecols=\"B:F\", nrows=2)\n", " df2 = pd.read_excel(f, \"2020\", skiprows=1, usecols=\"B:F\", nrows=2)\n", "\n", "df1" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['2019', '2020', '2019-2020']" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stores = pd.ExcelFile(\"xl/stores.xlsx\")\n", "stores.sheet_names" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StoreEmployeesManagerSince
0New York10Sarah2018-07-20
1San Francisco12Neriah2019-11-02
\n", "
" ], "text/plain": [ " Store Employees Manager Since\n", "0 New York 10 Sarah 2018-07-20\n", "1 San Francisco 12 Neriah 2019-11-02" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = (\"https://raw.githubusercontent.com/fzumstein/\"\n", " \"python-for-excel/1st-edition/xl/stores.xlsx\")\n", "pd.read_excel(url, skiprows=1, usecols=\"B:E\", nrows=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Writing Excel Files with pandas" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import datetime as dt" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatesFloatsIntegersBooleans
index
02020-01-01 10:13:002.2221True
12020-01-02 00:00:00NaN2False
22020-01-02 00:00:00inf3True
\n", "
" ], "text/plain": [ " Dates Floats Integers Booleans\n", "index \n", "0 2020-01-01 10:13:00 2.222 1 True\n", "1 2020-01-02 00:00:00 NaN 2 False\n", "2 2020-01-02 00:00:00 inf 3 True" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = [[dt.datetime(2020, 1, 1, 10, 13), 2.222, 1, True],\n", " [dt.datetime(2020, 1, 2), np.nan, 2, False],\n", " [dt.datetime(2020, 1, 2), np.inf, 3, True]]\n", "df = pd.DataFrame(data=data,\n", " columns=[\"Dates\", \"Floats\", \"Integers\", \"Booleans\"])\n", "df.index.name = \"index\"\n", "df" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "df.to_excel(\"written_with_pandas.xlsx\", sheet_name=\"Output\",\n", " startrow=1, startcol=1, index=True, header=True,\n", " na_rep=\"\", inf_rep=\"\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "with pd.ExcelWriter(\"written_with_pandas2.xlsx\") as writer:\n", " df.to_excel(writer, sheet_name=\"Sheet1\", startrow=1, startcol=1)\n", " df.to_excel(writer, sheet_name=\"Sheet1\", startrow=10, startcol=1)\n", " df.to_excel(writer, sheet_name=\"Sheet2\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }