Skip to content

Commit

Permalink
testing automation; unfinished
Browse files Browse the repository at this point in the history
  • Loading branch information
zzeppozz committed Sep 12, 2024
1 parent 5232419 commit 50593d4
Show file tree
Hide file tree
Showing 8 changed files with 255 additions and 183 deletions.
21 changes: 21 additions & 0 deletions _sphinx_config/pages/aws/aws_setup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,35 @@ The :ref:`_bison_redshift_s3_role` allows Redshift to access public S3 data and
the bison S3 bucket, and allows Redshift to perform glue functions. Its trust
relationship grants AssumeRole to redshift service.

Make sure that the same role granted to the namespace is used for creating an external
schema and lambda functions. When mounting external data as a redshift table to the
external schema, you may encounter an error indicating that the "dev" database does not
exist. This refers to the external database, and may indicate that the role used by the
command and/or namespace differs from the role granted to the schema upon creation.

Redshift Namespace and Workgroup
===========================================================

Namespace and Workgroup
------------------------------

A namespace is storage-related, with database objects and users. A workgroup is
a collection of compute resources such as security groups and other properties and
limitations.
https://docs.aws.amazon.com/redshift/latest/mgmt/serverless-workgroup-namespace.html

External Schema
------------------------
The command below creates an external schema, redshift_spectrum, and also creates a
**new** external database "dev". It appears in the console to be the same "dev"
database that contains the public schema, but it is separate. Also note the IAM role
used to create the schema must match the role attached to the namespace::

CREATE external schema redshift_spectrum
FROM data catalog
DATABASE dev
IAM_ROLE 'arn:aws:iam::321942852011:role/bison_redshift_s3_role'
CREATE external database if NOT exists;

EC2 instance creation
===========================================================
Expand Down
17 changes: 16 additions & 1 deletion _sphinx_config/pages/aws/roles.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,30 @@ bison_redshift_s3_role

* Policies:

* AmazonRedshiftAllCommandsFullAccess (AWS managed)
* AmazonRedshiftDataFullAccess (AWS managed)
* AmazonRedshiftFullAccess (AWS managed)
* bison_s3_policy (read public/GBIF S3 data and read/write bison S3 data)
* redshift_glue_policy.json (for Redshift interactions)
* AmazonRedshiftAllCommandsFullAccess (AWS managed)

* AmazonS3FullAccess (AWS managed)

* Trust policy:

*

bison_redshift_lambda_role
------------------------

Attach to BISON lambda functions

* AmazonRedshiftAllCommandsFullAccess (AWS managed)
* AmazonRedshiftDataFullAccess (AWS managed)
* AmazonRedshiftFullAccess (AWS managed)
* bison_lambda_log_policy (write CloudWatch logs to bison log groups)
TODO: add new log group for each lambda function
* bison_s3_policy (read public/GBIF S3 data and read/write bison S3 data)

.. _bison_ec2_s3_role:

bison_ec2_s3_role
Expand Down
53 changes: 19 additions & 34 deletions aws/events/bison_mount_gbif_lambda.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import json
import boto3
import botocore
import botocore.session as bc
from botocore.client import Config
from datetime import datetime
Expand All @@ -13,6 +12,8 @@
database = "dev"
dbuser = "IAM:aimee.stewart"
dbuser = "arn:aws:iam::321942852011:role/service-role/bison_subset_gbif_lambda-role-9i5qvpux"
pub_schema = "public"
external_schema = "redshift_spectrum"
timeout = 900
waittime = 5

Expand All @@ -21,8 +22,6 @@
gbif_datestr = f"{datetime.now().year}-{datetime.now().month:02d}-01"
parquet_key = f"occurrence/{gbif_datestr}/occurrence.parquet"
bison_datestr = gbif_datestr.replace("-", "_")
pub_schema = "public"
external_schema = "redshift_spectrum"

gbif_odr_data = f"s3://{gbif_bucket}/{parquet_key}/"
mounted_gbif_name = f"{external_schema}.occurrence_{bison_datestr}_parquet"
Expand Down Expand Up @@ -103,20 +102,6 @@ class VARCHAR(max),
('HUMAN_OBSERVATION', 'OBSERVATION', 'OCCURRENCE', 'PRESERVED_SPECIMEN');
"""

list_external_tables_stmt = f"""
SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
WHERE cls.relnamespace = ns.oid
AND schemaname = '{external_schema}';
"""

list_public_tables_stmt = f"""
SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
WHERE cls.relnamespace = ns.oid
AND schemaname = '{pub_schema}';
"""

count_gbif_stmt = f"SELECT COUNT(*) from {mounted_gbif_name};"
count_bison_stmt = f"SELECT COUNT(*) FROM {subset_bison_name};"
unmount_stmt = f"DROP TABLE {mounted_gbif_name};"
Expand Down Expand Up @@ -145,8 +130,8 @@ def lambda_handler(event, context):
# Submit query request
try:
submit_result = client_redshift.execute_statement(
WorkgroupName=workgroup, Database=database, Sql=list_public_tables_stmt)
print(f"*** {mounted_gbif_name} mount successfully executed")
WorkgroupName=workgroup, Database=database, Sql=mount_stmt)
print(f"*** Mount command submitted")

except Exception as e:
raise Exception(e)
Expand All @@ -157,7 +142,7 @@ def lambda_handler(event, context):
print(f"*** {k} = {v}")

# -------------------------------------
# Loop til complete
# Loop til complete, then describe result
elapsed_time = 0
complete = False
while not complete and elapsed_time < 300:
Expand All @@ -178,20 +163,20 @@ def lambda_handler(event, context):
print(f"Failed to describe_statement {e}")
complete = True

# # -------------------------------------
# # Get statement output for query
try:
stmt_result = client_redshift.get_statement_result(Id=submit_id)
except Exception as e:
print(f"Failed to get_statement_result {e}")
else:
print("*** statement_result records")
try:
records = stmt_result["Records"]
for rec in records:
print(f"*** {rec}")
except Exception as e:
print(f"Failed to return records ({e})")
# # # -------------------------------------
# # # IFF query, get statement output
# try:
# stmt_result = client_redshift.get_statement_result(Id=submit_id)
# except Exception as e:
# print(f"*** No get_statement_result {e}")
# else:
# print("*** get_statement_result records")
# try:
# records = stmt_result["Records"]
# for rec in records:
# print(f"*** {rec}")
# except Exception as e:
# print(f"Failed to return records ({e})")

return {
'statusCode': 200,
Expand Down
119 changes: 119 additions & 0 deletions aws/events/bison_query_lambda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import json
import boto3
import botocore.session as bc
from botocore.client import Config
from datetime import datetime
import time

print('Loading function')

# ----- AWS / Redshift configuration -----
region = "us-east-1"
workgroup = "bison"
database = "dev"
# Role used for Redshift Data API calls.
# NOTE: the original assigned dbuser three times (including an exact duplicate);
# only this final value was ever used, so the dead assignments are removed.
dbuser = "arn:aws:iam::321942852011:role/service-role/bison_subset_gbif_lambda-role-9i5qvpux"
bison_bucket = 'bison-321942852011-us-east-1'
timeout = 900   # seconds: botocore connect/read timeout
waittime = 2    # seconds between describe_statement polls

# Define the public bucket and file to query
gbif_bucket = f"gbif-open-data-{region}"
# GBIF Open Data snapshots are dated the 1st of the current month.
gbif_datestr = f"{datetime.now().year}-{datetime.now().month:02d}-01"
parquet_key = f"occurrence/{gbif_datestr}/occurrence.parquet"
bison_datestr = gbif_datestr.replace("-", "_")
pub_schema = "public"
external_schema = "redshift_spectrum"

gbif_odr_data = f"s3://{gbif_bucket}/{parquet_key}/"
mounted_gbif_name = f"{external_schema}.occurrence_{bison_datestr}_parquet"
subset_bison_name = f"{pub_schema}.bison_{bison_datestr}"

# Catalog query: list tables in the external (Redshift Spectrum) schema.
list_external_tables_stmt = f"""
SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
WHERE cls.relnamespace = ns.oid
AND schemaname = '{external_schema}';
"""

# Catalog query: list tables in the public schema.
list_public_tables_stmt = f"""
SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
WHERE cls.relnamespace = ns.oid
AND schemaname = '{pub_schema}';
"""

count_gbif_stmt = f"SELECT COUNT(*) from {mounted_gbif_name};"
count_bison_stmt = f"SELECT COUNT(*) FROM {subset_bison_name};"

# Initializing Botocore client.
# The original also created a throwaway boto3.session.Session() here and
# immediately overwrote it; that redundant call is removed.
bc_session = bc.get_session()
session = boto3.Session(
    botocore_session=bc_session,
    region_name=region
)

# Initializing Redshift's client
config = Config(connect_timeout=timeout, read_timeout=timeout)
client_redshift = session.client("redshift-data", config=config)

# -----------------------------------------------------
def lambda_handler(event, context):
    """List tables in the Redshift 'public' schema and log the results.

    Submits ``list_public_tables_stmt`` through the Redshift Data API,
    polls ``describe_statement`` until the statement reaches a terminal
    status (or ~300 seconds elapse), then fetches and logs the result rows.

    Args:
        event: Lambda trigger payload (unused).
        context: Lambda runtime context (unused).

    Returns:
        dict: ``{"statusCode": 200, "body": <json string>}``.

    Raises:
        Exception: if the statement cannot be submitted.
    """
    print("*** Entered lambda_handler")
    # -------------------------------------
    # Submit query request
    try:
        submit_result = client_redshift.execute_statement(
            WorkgroupName=workgroup, Database=database, Sql=list_public_tables_stmt)
        # Fixed log text: this submits a catalog query, not a mount command.
        print("*** Query submitted")
    except Exception:
        # Bare re-raise preserves the original traceback; the previous
        # `raise Exception(e)` discarded the exception type and chain.
        raise

    submit_id = submit_result["Id"]
    print(f"*** submit id = {submit_id}")
    for key, val in submit_result.items():
        print(f"*** {key} = {val}")

    # -------------------------------------
    # Poll until the statement completes (max ~300 seconds), then log
    # the final statement description.
    elapsed_time = 0
    complete = False
    while not complete and elapsed_time < 300:
        try:
            describe_result = client_redshift.describe_statement(Id=submit_id)
            status = describe_result["Status"]
            print(f"*** Query Status - {status} after {elapsed_time} seconds")
            if status in ("ABORTED", "FAILED", "FINISHED"):
                complete = True
                print(f"*** desc id = {describe_result['Id']}")
                for key, val in describe_result.items():
                    print(f"*** {key} = {val}")
            else:
                time.sleep(waittime)
                elapsed_time += waittime
        except Exception as e:
            # Best-effort polling: stop on describe failure rather than loop.
            print(f"Failed to describe_statement {e}")
            complete = True

    # -------------------------------------
    # For a SELECT statement, retrieve and log the result rows.
    # get_statement_result raises for non-query or failed statements;
    # that is treated as a non-fatal, logged condition.
    try:
        stmt_result = client_redshift.get_statement_result(Id=submit_id)
    except Exception as e:
        print(f"*** No get_statement_result {e}")
    else:
        print("*** get_statement_result records")
        try:
            for rec in stmt_result["Records"]:
                print(f"*** {rec}")
        except Exception as e:
            print(f"Failed to return records ({e})")

    return {
        'statusCode': 200,
        # Plain string: the original used an f-string with no placeholders.
        'body': json.dumps("Lambda result logged")
    }
Loading

0 comments on commit 50593d4

Please sign in to comment.