Google Sheets minimal example
info
The source code for this example can be found in our repository at: https://github.com/dlt-hub/dlt/tree/devel/docs/examples/google_sheets.
TLDR
This example demonstrates how to load Google Sheets data using Python and the dlt library. It covers working with Google API, using built in credentials, using union of credentials, and creating dynamically generated resources.
Setup: Running this example on your machine
# clone the dlt repository
git clone git@github.com:dlt-hub/dlt.git
# go to example directory
cd ./dlt/docs/examples/google_sheets
# install dlt with duckdb
pip install "dlt[duckdb]"
# run the example script
python google_sheets.py
Google Sheets data pipeline
In this example, you'll find a Python script that demonstrates how to load Google Sheets data using the dlt
library.
We'll learn how to:
- use built-in credentials;
- use union of credentials;
- create dynamically generated resources.
tip
This example is for educational purposes. For best practices, we recommend using Google Sheets verified source.
Install Google client library
pip install google-api-python-client
Loading code
from typing import Any, Iterator, Sequence, Union, cast
from googleapiclient.discovery import build
import dlt
from dlt.common.configuration.specs import (
GcpOAuthCredentials,
GcpServiceAccountCredentials,
)
from dlt.common.typing import DictStrAny, StrAny
def _initialize_sheets(
credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials]
) -> Any:
# Build the service object.
service = build("sheets", "v4", credentials=credentials.to_native_credentials())
return service
@dlt.source
def google_spreadsheet(
spreadsheet_id: str,
sheet_names: Sequence[str],
credentials: Union[
GcpServiceAccountCredentials, GcpOAuthCredentials, str, StrAny
] = dlt.secrets.value,
) -> Any:
sheets = _initialize_sheets(cast(GcpServiceAccountCredentials, credentials))
def get_sheet(sheet_name: str) -> Iterator[DictStrAny]:
# get list of list of typed values
result = (
sheets.spreadsheets()
.values()
.get(
spreadsheetId=spreadsheet_id,
range=sheet_name,
# unformatted returns typed values
valueRenderOption="UNFORMATTED_VALUE",
# will return formatted dates
dateTimeRenderOption="FORMATTED_STRING",
)
.execute()
)
# pprint.pprint(result)
values = result.get("values")
# yield dicts assuming row 0 contains headers and following rows values and all rows have identical length
for v in values[1:]:
yield {h: v for h, v in zip(values[0], v)}
# create resources from supplied sheet names
return [
dlt.resource(get_sheet(name), name=name, write_disposition="replace")
for name in sheet_names
]
Run the pipeline
if __name__ == "__main__":
pipeline = dlt.pipeline(destination="duckdb")
# see example.secrets.toml to where to put credentials
sheet_id = "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580"
range_names = ["hidden_columns_merged_cells", "Blank Columns"]
# "2022-05", "model_metadata"
info = pipeline.run(
google_spreadsheet(
spreadsheet_id=sheet_id,
sheet_names=range_names,
)
)
print(info)