Add getting started docker compose for Jupyter with Spark (#295)
kevinjqliu authored Oct 15, 2024
1 parent a8fe751 commit ac01e2d
Showing 6 changed files with 65 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/check-md-link.yml
@@ -41,5 +41,5 @@ jobs:
with:
use-quiet-mode: 'yes'
config-file: '.github/workflows/check-md-link-config.json'
- folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, notebooks'
+ folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, getting-started'
file-path: 'CHAT_BYLAWS.md, CODE_OF_CONDUCT.md, CONTRIBUTING.md, README.md SECURITY.md'
4 changes: 2 additions & 2 deletions .gitignore
@@ -26,8 +26,8 @@ regtests/output/
/polaris-venv/
/pyproject.toml

- # Notebooks
- notebooks/.ipynb_checkpoints/
+ # Notebook Checkpoints
+ **/.ipynb_checkpoints/

# Metastore
metastore_db/
45 changes: 45 additions & 0 deletions getting-started/spark/README.md
@@ -0,0 +1,45 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Getting Started with Apache Spark and Apache Polaris

This getting started guide provides a `docker-compose` file to set up [Apache Spark](https://spark.apache.org/) with Apache Polaris. Apache Polaris is configured as an Iceberg REST Catalog in Spark.
A Jupyter notebook is used to run PySpark.

## Run the `docker-compose` file
To start the services defined in the `docker-compose` file, run this command from the repo's root directory:
```
docker-compose -f getting-started/spark/docker-compose.yml up
```
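
Add the `-d` flag to run the services in the background. When you are done, the same file tears everything down:
```
docker-compose -f getting-started/spark/docker-compose.yml down
```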

This will spin up two container services:
* The `polaris` service for running Apache Polaris using an in-memory metastore
* The `jupyter` service for running Jupyter notebook with PySpark

## Access the Jupyter notebook interface
In the Jupyter notebook container log, look for the URL to access the Jupyter notebook. The URL is of the form `http://127.0.0.1:8888/lab?token=<token>`.
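
If the log line has already scrolled by, one way to recover the URL (assuming the services were started with the command above) is to grep the `jupyter` service logs:
```
docker-compose -f getting-started/spark/docker-compose.yml logs jupyter | grep token
```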

Open the URL in a browser and navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb) <!-- markdown-link-check-disable-line -->

## Change the Polaris credential
The Polaris service creates a new root credential on every startup. Find this credential in the Polaris service log and update the `polaris_credential` variable in the first cell of the Jupyter notebook.
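
For example, a quick way to pull the credential out of the log (the exact log line format may vary between Polaris versions) is:
```
docker-compose -f getting-started/spark/docker-compose.yml logs polaris | grep "root principal credentials"
```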

## Run the Jupyter notebook
You can now run all cells in the notebook or write your own code!
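
As a quick smoke test, a cell like the following sketch should work once the notebook's Spark session is up (it assumes the Spark catalog is registered under the name `polaris`, as in the notebook's configuration cell):
```
# List namespaces through the Polaris-backed Iceberg REST catalog
spark.sql("SHOW NAMESPACES IN polaris").show()
```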
getting-started/spark/docker-compose.yml
@@ -20,7 +20,7 @@
services:
polaris:
build:
- context: .
+ context: ../../
network: host
ports:
- "8181:8181"
@@ -37,8 +37,8 @@ services:
retries: 5
jupyter:
build:
- context: .
- dockerfile: ./notebooks/Dockerfile
+ context: ../../ # this is necessary to expose `regtests/` dir to notebooks/Dockerfile
+ dockerfile: ./getting-started/spark/notebooks/Dockerfile
network: host
ports:
- "8888:8888"
@@ -57,4 +57,4 @@ volumes:
driver_opts:
o: bind
type: none
- device: ./notebooks
+ device: ./notebooks/
File renamed without changes: notebooks/Dockerfile → getting-started/spark/notebooks/Dockerfile
getting-started/spark/notebooks/SparkPolaris.ipynb
Expand Up @@ -21,8 +21,11 @@
"from polaris.catalog.api_client import ApiClient as CatalogApiClient\n",
"from polaris.catalog.api_client import Configuration as CatalogApiClientConfiguration\n",
"\n",
"client_id = 'b3b6497353b33ea7'\n",
"client_secret = '623a67ee71d75825238e3e269df5cdac' # pragma: allowlist secret\n",
"# (CHANGE ME): This credential changes on every Polaris service restart\n",
"# In the Polaris log, look for the `realm: default-realm root principal credentials:` string\n",
"polaris_credential = '35df9f8a34199df0:101b9d35700032416210ad2d39b1b4e3' # pragma: allowlist secret\n",
"\n",
"client_id, client_secret = polaris_credential.split(\":\")\n",
"client = CatalogApiClient(CatalogApiClientConfiguration(username=client_id,\n",
" password=client_secret,\n",
" host='http://polaris:8181/api/catalog'))\n",
@@ -42,8 +45,7 @@
"source": [
"# Create our first catalog\n",
"\n",
"* Creates a catalog named `polaris_catalog` that writes to a specified location in S3.\n",
"* An AWS IAM role is specified - this role is assumed whenever we read or write data in the catalog"
"* Creates a catalog named `polaris_catalog` that writes to a specified location in the Local Filesystem."
]
},
{
@@ -59,11 +61,9 @@
" host='http://polaris:8181/api/management/v1'))\n",
"root_client = PolarisDefaultApi(client)\n",
"\n",
"storage_conf = AwsStorageConfigInfo(storage_type=\"S3\",\n",
" allowed_locations=[\"s3://datalake-storage-team/polaris_test/\"],\n",
" role_arn=\"arn:aws:iam::631484165566:role/datalake-storage-integration-role\")\n",
"storage_conf = FileStorageConfigInfo(storage_type=\"FILE\", allowed_locations=[\"file:///tmp\"])\n",
"catalog_name = 'polaris_demo'\n",
"catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"s3://datalake-storage-team/polaris_test/polaris_catalog\"},\n",
"catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"file:///tmp/polaris/\"},\n",
" storage_config_info=storage_conf)\n",
"catalog.storage_config_info = storage_conf\n",
"root_client.create_catalog(create_catalog_request=CreateCatalogRequest(catalog=catalog))\n",
@@ -272,7 +272,7 @@
" .config(\"spark.sql.catalog.polaris.credential\", f\"{engineer_principal.credentials.client_id}:{engineer_principal.credentials.client_secret}\")\n",
"\n",
" # Set the warehouse to the name of the catalog we created\n",
" .config(\"spark.sql.catalog.polaris.warehouse\", 'polaris_demo')\n",
" .config(\"spark.sql.catalog.polaris.warehouse\", catalog_name)\n",
"\n",
" # Scope set to PRINCIPAL_ROLE:ALL\n",
" .config(\"spark.sql.catalog.polaris.scope\", 'PRINCIPAL_ROLE:ALL')\n",
@@ -454,7 +454,7 @@
" return codecs.decode(\"1F\", \"hex\").decode(\"UTF-8\").join(namespace)\n",
"\n",
"# Call loadTable\n",
"tbl_meta = collado_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"tbl_meta = collado_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"display(JSON(tbl_meta.to_dict(), expanded=True))"
]
},
@@ -604,7 +604,7 @@
},
"outputs": [],
"source": [
"tbl_meta = pm_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"tbl_meta = pm_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"display(JSON(tbl_meta.to_dict(), expanded=True))"
]
},
@@ -632,7 +632,7 @@
},
"outputs": [],
"source": [
"pm_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
"pm_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
]
},
{
@@ -775,7 +775,7 @@
"# The ops_client fails to do any real damage even though the engineer normally has DROP_TABLE privileges\n",
"ops_client = IcebergCatalogAPI(CatalogApiClient(CatalogApiClientConfiguration(access_token=ops_token.access_token,\n",
" host='http://polaris:8181/api/catalog')))\n",
"ops_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
"ops_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
]
}
],
