Ch08 Excel File Manipulation with Reader and Writer

The Reader and Writer Packages

OpenPyXL

Reading with OpenPyXL

import pandas as pd
import openpyxl
import excel
import datetime as dt
# Open the workbook to read cell values.
# The file is automatically closed again after loading the data.
book = openpyxl.load_workbook("xl/stores.xlsx", data_only=True)
# Get a worksheet object by name or index (0-based)
sheet = book["2019"]
sheet = book.worksheets[0]
# Get a list with all sheet names
book.sheetnames
['2019', '2020', '2019-2020']
# Loop through the sheet objects.
# Instead of "name", openpyxl uses "title".
for i in book.worksheets:
    print(i.title)
2019
2020
2019-2020
# Getting the dimensions,
# i.e. the used range of the sheet
sheet.max_row, sheet.max_column
(8, 6)
# Read the value of a single cell
# using "A1" notation and using cell indices (1-based)
sheet["B6"].value
sheet.cell(row=6, column=2).value
'Boston'
# Read in a range of cell values by using our excel module
data = excel.read(book["2019"], (2, 2), (8, 6))
data[:2]  # Print the first two rows
[['Store', 'Employees', 'Manager', 'Since', 'Flagship'],
 ['New York', 10, 'Sarah', datetime.datetime(2018, 7, 20, 0, 0), False]]

Writing with OpenPyXL

import openpyxl
from openpyxl.drawing.image import Image
from openpyxl.chart import BarChart, Reference
from openpyxl.styles import Font, colors
from openpyxl.styles.borders import Border, Side
from openpyxl.styles.alignment import Alignment
from openpyxl.styles.fills import PatternFill
import excel
# Instantiate a workbook
book = openpyxl.Workbook()

# Get the first sheet and give it a name
sheet = book.active
sheet.title = "Sheet1"

# Writing individual cells using A1 notation
# and cell indices (1-based)
sheet["A1"].value = "Hello 1"
sheet.cell(row=2, column=1, value="Hello 2")

# Formatting: fill color, alignment, border and font
font_format = Font(color="FF0000", bold=True)
thin = Side(border_style="thin", color="FF0000")
sheet["A3"].value = "Hello 3"
sheet["A3"].font = font_format
sheet["A3"].border = Border(top=thin, left=thin,
                            right=thin, bottom=thin)
sheet["A3"].alignment = Alignment(horizontal="center")
sheet["A3"].fill = PatternFill(fgColor="FFFF00", fill_type="solid")

# Number formatting (using Excel's formatting strings)
sheet["A4"].value = 3.3333
sheet["A4"].number_format = "0.00"

# Date formatting (using Excel's formatting strings)
sheet["A5"].value = dt.date(2016, 10, 13)
sheet["A5"].number_format = "mm/dd/yy"

# Formula: you must use the English name of the formula
# with commas as delimiters
sheet["A6"].value = "=SUM(A4, 2)"

# Image
sheet.add_image(Image("images/python.png"), "C1")

# Two-dimensional list (we're using our excel module)
data = [[None, "North", "South"],
        ["Last Year", 2, 5],
        ["This Year", 3, 6]]
excel.write(sheet, data, "A10")

# Chart
chart = BarChart()
chart.type = "col"
chart.title = "Sales Per Region"
chart.x_axis.title = "Regions"
chart.y_axis.title = "Sales"
chart_data = Reference(sheet, min_row=11, min_col=1,
                       max_row=12, max_col=3)
chart_categories = Reference(sheet, min_row=10, min_col=2,
                             max_row=10, max_col=3)
# from_rows interprets the data in the same way
# as if you would add a chart manually in Excel
chart.add_data(chart_data, titles_from_data=True, from_rows=True)
chart.set_categories(chart_categories)
sheet.add_chart(chart, "A15")

# Saving the workbook creates the file on disk
book.save("openpyxl.xlsx")
book = openpyxl.Workbook()
sheet = book.active
sheet["A1"].value = "This is a template"
book.template = True
book.save("template.xltx")

Editing with OpenPyXL

# Read the stores.xlsx file, change a cell
# and store it under a new location/name.
book = openpyxl.load_workbook("xl/stores.xlsx")
book["2019"]["A1"].value = "modified"
book.save("stores_edited.xlsx")
book = openpyxl.load_workbook("xl/macro.xlsm", keep_vba=True)
book["Sheet1"]["A1"].value = "Click the button!"
book.save("macro_openpyxl.xlsm")

XlsxWriter

import datetime as dt
import xlsxwriter
import excel
# Instantiate a workbook
book = xlsxwriter.Workbook("xlxswriter.xlsx")

# Add a sheet and give it a name
sheet = book.add_worksheet("Sheet1")

# Writing individual cells using A1 notation
# and cell indices (0-based)
sheet.write("A1", "Hello 1")
sheet.write(1, 0, "Hello 2")

# Formatting: fill color, alignment, border and font
formatting = book.add_format({"font_color": "#FF0000",
                              "bg_color": "#FFFF00",
                              "bold": True, "align": "center",
                              "border": 1, "border_color": "#FF0000"})
sheet.write("A3", "Hello 3", formatting)

# Number formatting (using Excel's formatting strings)
number_format = book.add_format({"num_format": "0.00"})
sheet.write("A4", 3.3333, number_format)

# Date formatting (using Excel's formatting strings)
date_format = book.add_format({"num_format": "mm/dd/yy"})
sheet.write("A5", dt.date(2016, 10, 13), date_format)

# Formula: you must use the English name of the formula
# with commas as delimiters
sheet.write("A6", "=SUM(A4, 2)")

# Image
sheet.insert_image(0, 2, "images/python.png")

# Two-dimensional list (we're using our excel module)
data = [[None, "North", "South"],
        ["Last Year", 2, 5],
        ["This Year", 3, 6]]
excel.write(sheet, data, "A10")

# Chart: see the file "sales_report_xlsxwriter.py" in the
# companion repo to see how you can work with indices
# instead of cell addresses
chart = book.add_chart({"type": "column"})
chart.set_title({"name": "Sales per Region"})
chart.add_series({"name": "=Sheet1!A11",
                  "categories": "=Sheet1!B10:C10",
                  "values": "=Sheet1!B11:C11"})
chart.add_series({"name": "=Sheet1!A12",
                  "categories": "=Sheet1!B10:C10",
                  "values": "=Sheet1!B12:C12"})
chart.set_x_axis({"name": "Regions"})
chart.set_y_axis({"name": "Sales"})
sheet.insert_chart("A15", chart)

# Closing the workbook creates the file on disk
book.close()
### book = xlsxwriter.Workbook("macro_xlxswriter.xlsm")
sheet = book.add_worksheet("Sheet1")
sheet.write("A1", "Click the button!")
book.add_vba_project("xl/vbaProject.bin")
sheet.insert_button("A3", {"macro": "Hello", "caption": "Button 1",
                           "width": 130, "height": 35})
book.close()

pyxlsb

import pyxlsb
import excel
# Loop through sheets. With pyxlsb, the workbook
# and sheet objects can be used as context managers.
# book.sheets returns a list of sheet names, not objects!
# To get a sheet object, use get_sheet() instead.
with pyxlsb.open_workbook("xl/stores.xlsb") as book:
    for sheet_name in book.sheets:
        with book.get_sheet(sheet_name) as sheet:
            dim = sheet.dimension
            print(f"Sheet '{sheet_name}' has " 
                  f"{dim.h} rows and {dim.w} cols")
Sheet '2019' has 7 rows and 5 cols
Sheet '2020' has 7 rows and 5 cols
Sheet '2019-2020' has 20 rows and 5 cols
# Read in the values of a range of cells by using our excel module.
# Instead of "2019", you could also use its index (1-based).
with pyxlsb.open_workbook("xl/stores.xlsb") as book:
    with book.get_sheet("2019") as sheet:
        data = excel.read(sheet, "B2")
data[:2]  # Print the first two rows
from pyxlsb import convert_date
convert_date(data[1][3])
df = pd.read_excel("xl/stores.xlsb", engine="pyxlsb")

xlrd, xlwt and xlutils

Reading with xlrd

import xlrd
import xlwt
from xlwt.Utils import cell_to_rowcol2
import xlutils
import excel
# Open the workbook to read cell values. The file is
# automatically closed again after loading the data.
book = xlrd.open_workbook("xl/stores.xls")
# Get a list with all sheet names
book.sheet_names()
['2019', '2020', '2019-2020']
# Loop through the sheet objects
for sheet in book.sheets():
    print(sheet.name)
2019
2020
2019-2020
# Get a sheet object by name or index (0-based)
sheet = book.sheet_by_index(0)
sheet = book.sheet_by_name("2019")
# Dimensions
sheet.nrows, sheet.ncols
(8, 6)
# Read the value of a single cell
# using "A1" notation and using cell indices (0-based).
# The "*" unpacks the tuple that cell_to_rowcol2 returns 
# into individual arguments.
sheet.cell(*cell_to_rowcol2("B3")).value
sheet.cell(2, 1).value
'New York'
# Read in a range of cell values by using our excel module
data = excel.read(sheet, "B2")
data[:2]  # Print the first two rows
[['Store', 'Employees', 'Manager', 'Since', 'Flagship'],
 ['New York', 10.0, 'Sarah', datetime.datetime(2018, 7, 20, 0, 0), False]]

Writing with xlwt

import xlwt
from xlwt.Utils import cell_to_rowcol2
import datetime as dt
import excel
# Instantiate a workbook
book = xlwt.Workbook()

# Add a sheet and give it a name
sheet = book.add_sheet("Sheet1")

# Writing individual cells using A1 notation
# and cell indices (0-based)
sheet.write(*cell_to_rowcol2("A1"), "Hello 1")
sheet.write(r=1, c=0, label="Hello 2")

# Formatting: fill color, alignment, border and font
formatting = xlwt.easyxf("font: bold on, color red;"
                         "align: horiz center;"
                         "borders: top_color red, bottom_color red,"
                                  "right_color red, left_color red,"
                                  "left thin, right thin,"
                                  "top thin, bottom thin;"
                         "pattern: pattern solid, fore_color yellow;")
sheet.write(r=2, c=0, label="Hello 3", style=formatting)

# Number formatting (using Excel's formatting strings)
number_format = xlwt.easyxf(num_format_str="0.00")
sheet.write(3, 0, 3.3333, number_format)

# Date formatting (using Excel's formatting strings)
date_format = xlwt.easyxf(num_format_str="mm/dd/yyyy")
sheet.write(4, 0, dt.datetime(2012, 2, 3), date_format)

# Formula: you must use the English name of the formula
# with commas as delimiters
sheet.write(5, 0, xlwt.Formula("SUM(A4, 2)"))

# Two-dimensional list (we're using our excel module)
data = [[None, "North", "South"],
        ["Last Year", 2, 5],
        ["This Year", 3, 6]]
excel.write(sheet, data, "A10")

# Picture (only allows to add bmp format)
sheet.insert_bitmap("images/python.bmp", 0, 2)

# This writes the file to disk
book.save("xlwt.xls")

Editing with xlutils

import xlutils.copy
book = xlrd.open_workbook("xl/stores.xls", formatting_info=True)
book = xlutils.copy.copy(book)
book.get_sheet(0).write(0, 0, "changed!")
book.save("stores_edited.xls")

Working with Big Files

Writing with OpenPyXL

book = openpyxl.Workbook(write_only=True)
# With write_only=True, book.active doesn't work
sheet = book.create_sheet()
# This will produce a sheet with 1000 x 200 cells
for row in range(1000):
    sheet.append(list(range(200)))
book.save("openpyxl_optimized.xlsx")

Writing with XlsxWriter

book = xlsxwriter.Workbook("xlsxwriter_optimized.xlsx",
                           options={"constant_memory": True})
sheet = book.add_worksheet()
# This will produce a sheet with 1000 x 200 cells
for row in range(1000):
    sheet.write_row(row , 0, list(range(200)))
book.close()

Reading with xlrd

with xlrd.open_workbook("xl/stores.xls", on_demand=True) as book:
    sheet = book.sheet_by_index(0)  # Only loads the first sheet
with xlrd.open_workbook("xl/stores.xls", on_demand=True) as book:
    with pd.ExcelFile(book, engine="xlrd") as f:
        df = pd.read_excel(f, sheet_name=0)

Reading with OpenPyXL

book = openpyxl.load_workbook("xl/big.xlsx",
                              data_only=True, read_only=True,
                              keep_links=False)
# Perform the desired read operations here
book.close()  # Required with read_only=True

Reading in Parallel

%%time
data = pd.read_excel("xl/big.xlsx",
                     sheet_name=None, engine="openpyxl")
%%time
import parallel_pandas
data = parallel_pandas.read_excel("xl/big.xlsx", sheet_name=None)

Formatting DataFrames in Excel

with pd.ExcelFile("xl/stores.xlsx", engine="openpyxl") as xlfile:
    # Read a DataFrame
    df = pd.read_excel(xlfile, sheet_name="2020")

    # Get the OpenPyXL workbook object
    book = xlfile.book

    # From here on, it's OpenPyXL code
    sheet = book["2019"]
    value = sheet["B3"].value  # Read a single value
with pd.ExcelWriter("pandas_and_openpyxl.xlsx",
                    engine="openpyxl") as writer:
    df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": [5, 6, 7, 8]})
    # Write a DataFrame
    df.to_excel(writer, "Sheet1", startrow=4, startcol=2)

    # Get the OpenPyXL workbook and sheet objects
    book = writer.book
    sheet = writer.sheets["Sheet1"]

    # From here on, it's OpenPyXL code
    sheet["A1"].value = "This is a Title"  # Write a single cell value
df = pd.DataFrame({"col1": [1, -2], "col2": [-3, 4]},
                   index=["row1", "row2"])
df.index.name = "ix"
df
from openpyxl.styles import PatternFill
with pd.ExcelWriter("formatting_openpyxl.xlsx",
                    engine="openpyxl") as writer:
    # Write out the df with the default formatting to A1
    df.to_excel(writer, startrow=0, startcol=0)

    # Write out the df with custom index/header formatting to A6    
    startrow, startcol = 0, 5
    # 1. Write out the data part of the DataFrame
    df.to_excel(writer, header=False, index=False,
                startrow=startrow + 1, startcol=startcol + 1)
    # Get the sheet object and create a style object
    sheet = writer.sheets["Sheet1"]
    style = PatternFill(fgColor="D9D9D9", fill_type="solid")

    # 2. Write out the styled column headers
    for i, col in enumerate(df.columns):
        sheet.cell(row=startrow + 1, column=i + startcol + 2,
                   value=col).fill = style

    # 3. Write out the styled index
    index = [df.index.name if df.index.name else None] + list(df.index)
    for i, row in enumerate(index):
        sheet.cell(row=i + startrow + 1, column=startcol + 1,
                   value=row).fill = style
# Formatting index/headers with XlsxWriter
with pd.ExcelWriter("formatting_xlsxwriter.xlsx",
                    engine="xlsxwriter") as writer:
    # Write out the df with the default formatting to A1
    df.to_excel(writer, startrow=0, startcol=0)

    # Write out the df with custom index/header formatting to A6
    startrow, startcol = 0, 5
    # 1. Write out the data part of the DataFrame
    df.to_excel(writer, header=False, index=False,
                startrow=startrow + 1, startcol=startcol + 1)
    # Get the book and sheet object and create a style object
    book = writer.book
    sheet = writer.sheets["Sheet1"]
    style = book.add_format({"bg_color": "#D9D9D9"})

    # 2. Write out the styled column headers
    for i, col in enumerate(df.columns):
        sheet.write(startrow, startcol + i + 1, col, style)

    # 3. Write out the styled index
    index = [df.index.name if df.index.name else None] + list(df.index)
    for i, row in enumerate(index):
        sheet.write(startrow + i, startcol, row, style)
from openpyxl.styles import Alignment
with pd.ExcelWriter("data_format_openpyxl.xlsx",
                    engine="openpyxl") as writer:
    # Write out the DataFrame
    df.to_excel(writer)
    
    # Get the book and sheet objects
    book = writer.book
    sheet = writer.sheets["Sheet1"]
    
    # Formatting individual cells
    nrows, ncols = df.shape
    for row in range(nrows):
        for col in range(ncols):
            # +1 to account for the header/index
            # +1 since OpenPyXL is 1-based
            cell = sheet.cell(row=row + 2,
                              column=col + 2)
            cell.number_format = "0.000"
            cell.alignment = Alignment(horizontal="center")
with pd.ExcelWriter("data_format_xlsxwriter.xlsx",
                    engine="xlsxwriter") as writer:
    # Write out the DataFrame
    df.to_excel(writer)

    # Get the book and sheet objects    
    book = writer.book
    sheet = writer.sheets["Sheet1"]
    
    # Formatting the columns (individual cells can't be formatted)
    number_format = book.add_format({"num_format": "0.000",
                                     "align": "center"})
    sheet.set_column(first_col=1, last_col=2,
                     cell_format=number_format)
df.style.applymap(lambda x: "number-format: 0.000;"
                            "text-align: center")\
        .to_excel("styled.xlsx")
df = pd.DataFrame({"Date": [dt.date(2020, 1, 1)],
                   "Datetime": [dt.datetime(2020, 1, 1, 10)]})
with pd.ExcelWriter("date.xlsx",
                    date_format="yyyy-mm-dd",
                    datetime_format="yyyy-mm-dd hh:mm:ss") as writer:
    df.to_excel(writer)