Skip to content

Commit

Permalink
testing automation; unfinished
Browse files Browse the repository at this point in the history
  • Loading branch information
zzeppozz committed Sep 12, 2024
1 parent 5232419 commit 50593d4
Show file tree
Hide file tree
Showing 8 changed files with 255 additions and 183 deletions.
21 changes: 21 additions & 0 deletions _sphinx_config/pages/aws/aws_setup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,35 @@ The :ref:`_bison_redshift_s3_role` allows Redshift to access public S3 data and
the bison S3 bucket, and allows Redshift to perform glue functions. Its trust
relationship grants AssumeRole to redshift service.

Make sure that the same role granted to the namespace is used for creating an external
schema and lambda functions. When mounting external data as a redshift table to the
external schema, you may encounter an error indicating that the "dev" database does not
exist. This refers to the external database, and may indicate that the role used by the
command and/or namespace differs from the role granted to the schema upon creation.

Redshift Namespace and Workgroup
===========================================================

Namespace and Workgroup
------------------------------

A namespace is storage-related, with database objects and users. A workgroup is
a collection of compute resources such as security groups and other properties and
limitations.
https://docs.aws.amazon.com/redshift/latest/mgmt/serverless-workgroup-namespace.html

External Schema
------------------------
The command below creates an external schema, redshift_spectrum, and also creates a
**new** external database "dev". It appears in the console to be the same "dev"
database that contains the public schema, but it is separate. Also note the IAM role
used to create the schema must match the role attached to the namespace::

CREATE external schema redshift_spectrum
FROM data catalog
DATABASE dev
IAM_ROLE 'arn:aws:iam::321942852011:role/bison_redshift_s3_role'
CREATE external database if NOT exists;

EC2 instance creation
===========================================================
Expand Down
17 changes: 16 additions & 1 deletion _sphinx_config/pages/aws/roles.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,30 @@ bison_redshift_s3_role

* Policies:

* AmazonRedshiftAllCommandsFullAccess (AWS managed)
* AmazonRedshiftDataFullAccess (AWS managed)
* AmazonRedshiftFullAccess (AWS managed)
* bison_s3_policy (read public/GBIF S3 data and read/write bison S3 data)
* redshift_glue_policy.json (for Redshift interactions)
* AmazonRedshiftAllCommandsFullAccess (AWS managed)

* AmazonS3FullAccess (AWS managed)

* Trust policy:

*

bison_redshift_lambda_role
------------------------

Attach to BISON lambda functions

* AmazonRedshiftAllCommandsFullAccess (AWS managed)
* AmazonRedshiftDataFullAccess (AWS managed)
* AmazonRedshiftFullAccess (AWS managed)
* bison_lambda_log_policy (write CloudWatch logs to bison log groups)
TODO: add new log group for each lambda function
* bison_s3_policy (read public/GBIF S3 data and read/write bison S3 data)

.. _bison_ec2_s3_role:

bison_ec2_s3_role
Expand Down
53 changes: 19 additions & 34 deletions aws/events/bison_mount_gbif_lambda.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import json
import boto3
import botocore
import botocore.session as bc
from botocore.client import Config
from datetime import datetime
Expand All @@ -13,6 +12,8 @@
database = "dev"
dbuser = "IAM:aimee.stewart"
dbuser = "arn:aws:iam::321942852011:role/service-role/bison_subset_gbif_lambda-role-9i5qvpux"
pub_schema = "public"
external_schema = "redshift_spectrum"
timeout = 900
waittime = 5

Expand All @@ -21,8 +22,6 @@
gbif_datestr = f"{datetime.now().year}-{datetime.now().month:02d}-01"
parquet_key = f"occurrence/{gbif_datestr}/occurrence.parquet"
bison_datestr = gbif_datestr.replace("-", "_")
pub_schema = "public"
external_schema = "redshift_spectrum"

gbif_odr_data = f"s3://{gbif_bucket}/{parquet_key}/"
mounted_gbif_name = f"{external_schema}.occurrence_{bison_datestr}_parquet"
Expand Down Expand Up @@ -103,20 +102,6 @@ class VARCHAR(max),
('HUMAN_OBSERVATION', 'OBSERVATION', 'OCCURRENCE', 'PRESERVED_SPECIMEN');
"""

list_external_tables_stmt = f"""
SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
WHERE cls.relnamespace = ns.oid
AND schemaname = '{external_schema}';
"""

list_public_tables_stmt = f"""
SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
WHERE cls.relnamespace = ns.oid
AND schemaname = '{pub_schema}';
"""

count_gbif_stmt = f"SELECT COUNT(*) from {mounted_gbif_name};"
count_bison_stmt = f"SELECT COUNT(*) FROM {subset_bison_name};"
unmount_stmt = f"DROP TABLE {mounted_gbif_name};"
Expand Down Expand Up @@ -145,8 +130,8 @@ def lambda_handler(event, context):
# Submit query request
try:
submit_result = client_redshift.execute_statement(
WorkgroupName=workgroup, Database=database, Sql=list_public_tables_stmt)
print(f"*** {mounted_gbif_name} mount successfully executed")
WorkgroupName=workgroup, Database=database, Sql=mount_stmt)
print(f"*** Mount command submitted")

except Exception as e:
raise Exception(e)
Expand All @@ -157,7 +142,7 @@ def lambda_handler(event, context):
print(f"*** {k} = {v}")

# -------------------------------------
# Loop til complete
# Loop til complete, then describe result
elapsed_time = 0
complete = False
while not complete and elapsed_time < 300:
Expand All @@ -178,20 +163,20 @@ def lambda_handler(event, context):
print(f"Failed to describe_statement {e}")
complete = True

# # -------------------------------------
# # Get statement output for query
try:
stmt_result = client_redshift.get_statement_result(Id=submit_id)
except Exception as e:
print(f"Failed to get_statement_result {e}")
else:
print("*** statement_result records")
try:
records = stmt_result["Records"]
for rec in records:
print(f"*** {rec}")
except Exception as e:
print(f"Failed to return records ({e})")
# # # -------------------------------------
# # # IFF query, get statement output
# try:
# stmt_result = client_redshift.get_statement_result(Id=submit_id)
# except Exception as e:
# print(f"*** No get_statement_result {e}")
# else:
# print("*** get_statement_result records")
# try:
# records = stmt_result["Records"]
# for rec in records:
# print(f"*** {rec}")
# except Exception as e:
# print(f"Failed to return records ({e})")

return {
'statusCode': 200,
Expand Down
119 changes: 119 additions & 0 deletions aws/events/bison_query_lambda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import json
import boto3
import botocore.session as bc
from botocore.client import Config
from datetime import datetime
import time

print('Loading function')

# ----- AWS / Redshift configuration -----
region = "us-east-1"
workgroup = "bison"
database = "dev"
# Role used for Redshift Data API calls.
# NOTE: the original assigned dbuser three times (including an exact duplicate);
# only this final value was ever used, so the dead assignments are removed.
dbuser = "arn:aws:iam::321942852011:role/service-role/bison_subset_gbif_lambda-role-9i5qvpux"
bison_bucket = 'bison-321942852011-us-east-1'
timeout = 900   # seconds: botocore connect/read timeout
waittime = 2    # seconds between describe_statement polls

# Define the public bucket and file to query
gbif_bucket = f"gbif-open-data-{region}"
# GBIF Open Data snapshots are dated the 1st of the current month.
gbif_datestr = f"{datetime.now().year}-{datetime.now().month:02d}-01"
parquet_key = f"occurrence/{gbif_datestr}/occurrence.parquet"
bison_datestr = gbif_datestr.replace("-", "_")
pub_schema = "public"
external_schema = "redshift_spectrum"

gbif_odr_data = f"s3://{gbif_bucket}/{parquet_key}/"
mounted_gbif_name = f"{external_schema}.occurrence_{bison_datestr}_parquet"
subset_bison_name = f"{pub_schema}.bison_{bison_datestr}"

# Catalog query: list tables in the external (Redshift Spectrum) schema.
list_external_tables_stmt = f"""
SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
WHERE cls.relnamespace = ns.oid
AND schemaname = '{external_schema}';
"""

# Catalog query: list tables in the public schema.
list_public_tables_stmt = f"""
SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
WHERE cls.relnamespace = ns.oid
AND schemaname = '{pub_schema}';
"""

count_gbif_stmt = f"SELECT COUNT(*) from {mounted_gbif_name};"
count_bison_stmt = f"SELECT COUNT(*) FROM {subset_bison_name};"

# Initializing Botocore client.
# The original also created a throwaway boto3.session.Session() here and
# immediately overwrote it; that redundant call is removed.
bc_session = bc.get_session()
session = boto3.Session(
    botocore_session=bc_session,
    region_name=region
)

# Initializing Redshift's client
config = Config(connect_timeout=timeout, read_timeout=timeout)
client_redshift = session.client("redshift-data", config=config)

# -----------------------------------------------------
def lambda_handler(event, context):
    """List tables in the Redshift 'public' schema and log the results.

    Submits ``list_public_tables_stmt`` through the Redshift Data API,
    polls ``describe_statement`` until the statement reaches a terminal
    status (or ~300 seconds elapse), then fetches and logs the result rows.

    Args:
        event: Lambda trigger payload (unused).
        context: Lambda runtime context (unused).

    Returns:
        dict: ``{"statusCode": 200, "body": <json string>}``.

    Raises:
        Exception: if the statement cannot be submitted.
    """
    print("*** Entered lambda_handler")
    # -------------------------------------
    # Submit query request
    try:
        submit_result = client_redshift.execute_statement(
            WorkgroupName=workgroup, Database=database, Sql=list_public_tables_stmt)
        # Fixed log text: this submits a catalog query, not a mount command.
        print("*** Query submitted")
    except Exception:
        # Bare re-raise preserves the original traceback; the previous
        # `raise Exception(e)` discarded the exception type and chain.
        raise

    submit_id = submit_result["Id"]
    print(f"*** submit id = {submit_id}")
    for key, val in submit_result.items():
        print(f"*** {key} = {val}")

    # -------------------------------------
    # Poll until the statement completes (max ~300 seconds), then log
    # the final statement description.
    elapsed_time = 0
    complete = False
    while not complete and elapsed_time < 300:
        try:
            describe_result = client_redshift.describe_statement(Id=submit_id)
            status = describe_result["Status"]
            print(f"*** Query Status - {status} after {elapsed_time} seconds")
            if status in ("ABORTED", "FAILED", "FINISHED"):
                complete = True
                print(f"*** desc id = {describe_result['Id']}")
                for key, val in describe_result.items():
                    print(f"*** {key} = {val}")
            else:
                time.sleep(waittime)
                elapsed_time += waittime
        except Exception as e:
            # Best-effort polling: stop on describe failure rather than loop.
            print(f"Failed to describe_statement {e}")
            complete = True

    # -------------------------------------
    # For a SELECT statement, retrieve and log the result rows.
    # get_statement_result raises for non-query or failed statements;
    # that is treated as a non-fatal, logged condition.
    try:
        stmt_result = client_redshift.get_statement_result(Id=submit_id)
    except Exception as e:
        print(f"*** No get_statement_result {e}")
    else:
        print("*** get_statement_result records")
        try:
            for rec in stmt_result["Records"]:
                print(f"*** {rec}")
        except Exception as e:
            print(f"Failed to return records ({e})")

    return {
        'statusCode': 200,
        # Plain string: the original used an f-string with no placeholders.
        'body': json.dumps("Lambda result logged")
    }
Loading

0 comments on commit 50593d4

Please sign in to comment.