From 50593d42aecfcbabcf5b40ab4fbc899f95dc36d5 Mon Sep 17 00:00:00 2001
From: Aimee Stewart
Date: Thu, 12 Sep 2024 16:01:45 -0500
Subject: [PATCH] testing automation; unfinished

---
 _sphinx_config/pages/aws/aws_setup.rst |  21 ++++
 _sphinx_config/pages/aws/roles.rst     |  17 ++-
 aws/events/bison_mount_gbif_lambda.py  |  53 +++------
 aws/events/bison_query_lambda.py       | 119 +++++++++++++++++++
 aws/events/bison_subset_gbif_lambda.py | 158 ++++++-------------------
 aws/redshift/load_ancillary_data.sql   |  10 +-
 aws/redshift/queries.sql               |  21 ++++
 aws/redshift/subset_to_bison.sql       |  39 +++---
 8 files changed, 255 insertions(+), 183 deletions(-)
 create mode 100644 aws/events/bison_query_lambda.py
 create mode 100644 aws/redshift/queries.sql

diff --git a/_sphinx_config/pages/aws/aws_setup.rst b/_sphinx_config/pages/aws/aws_setup.rst
index c286d3d1..986cc305 100644
--- a/_sphinx_config/pages/aws/aws_setup.rst
+++ b/_sphinx_config/pages/aws/aws_setup.rst
@@ -13,14 +13,35 @@
 The :ref:`_bison_redshift_s3_role` allows Redshift to access public S3 data and
 the bison S3 bucket, and allows Redshift to perform glue functions.  Its trust
 relationship grants AssumeRole to the redshift service.
+Make sure that the same role granted to the namespace is used for creating the external
+schema and the lambda functions. When mounting external data as a redshift table in the
+external schema, you may encounter an error indicating that the "dev" database does not
+exist. This refers to the external database, and may indicate that the role used by the
+command and/or namespace differs from the role granted to the schema upon creation.
+
 Redshift Namespace and Workgroup
 ===========================================================

+Namespace and Workgroup
+------------------------------
+
 A namespace is storage-related, with database objects and users.  A workgroup is a
 collection of compute resources and associated properties and limits, such as
 security groups.
 https://docs.aws.amazon.com/redshift/latest/mgmt/serverless-workgroup-namespace.html

+External Schema
+------------------------
+The command below creates an external schema, redshift_spectrum, and also creates a
+**new** external database "dev". It appears in the console to be the same "dev"
+database that contains the public schema, but it is separate.
Also note the IAM role
+used to create the schema must match the role attached to the namespace::
+
+    CREATE external schema redshift_spectrum
+        FROM data catalog
+        DATABASE dev
+        IAM_ROLE 'arn:aws:iam::321942852011:role/bison_redshift_s3_role'
+        CREATE external database if NOT exists;

 EC2 instance creation
 ===========================================================

diff --git a/_sphinx_config/pages/aws/roles.rst b/_sphinx_config/pages/aws/roles.rst
index 9a088892..3308d989 100644
--- a/_sphinx_config/pages/aws/roles.rst
+++ b/_sphinx_config/pages/aws/roles.rst
@@ -13,15 +13,30 @@ bison_redshift_s3_role

 * Policies:

+  * AmazonRedshiftAllCommandsFullAccess (AWS managed)
+  * AmazonRedshiftDataFullAccess (AWS managed)
+  * AmazonRedshiftFullAccess (AWS managed)
   * bison_s3_policy (read public/GBIF S3 data and read/write bison S3 data)
   * redshift_glue_policy.json (for Redshift interactions)
-  * AmazonRedshiftAllCommandsFullAccess (AWS managed)
+  * AmazonS3FullAccess (AWS managed)

 * Trust policy:

   *

+bison_redshift_lambda_role
+--------------------------
+
+Attached to BISON lambda functions. Policies:
+
+  * AmazonRedshiftAllCommandsFullAccess (AWS managed)
+  * AmazonRedshiftDataFullAccess (AWS managed)
+  * AmazonRedshiftFullAccess (AWS managed)
+  * bison_lambda_log_policy (write CloudWatch logs to bison log groups)
+    TODO: add a new log group for each lambda function
+  * bison_s3_policy (read public/GBIF S3 data and read/write bison S3 data)
+
 .. _bison_ec2_s3_role:

 bison_ec2_s3_role
diff --git a/aws/events/bison_mount_gbif_lambda.py b/aws/events/bison_mount_gbif_lambda.py
index e2f0c200..1bd673d4 100644
--- a/aws/events/bison_mount_gbif_lambda.py
+++ b/aws/events/bison_mount_gbif_lambda.py
@@ -1,6 +1,5 @@
 import json
 import boto3
-import botocore
 import botocore.session as bc
 from botocore.client import Config
 from datetime import datetime
@@ -13,6 +12,8 @@
 database = "dev"
 dbuser = "IAM:aimee.stewart"
 dbuser = "arn:aws:iam::321942852011:role/service-role/bison_subset_gbif_lambda-role-9i5qvpux"
+pub_schema = "public"
+external_schema = "redshift_spectrum"
 timeout = 900
 waittime = 5

@@ -21,8 +22,6 @@
 gbif_datestr = f"{datetime.now().year}-{datetime.now().month:02d}-01"
 parquet_key = f"occurrence/{gbif_datestr}/occurrence.parquet"
 bison_datestr = gbif_datestr.replace("-", "_")
-pub_schema = "public"
-external_schema = "redshift_spectrum"

 gbif_odr_data = f"s3://{gbif_bucket}/{parquet_key}/"
 mounted_gbif_name = f"{external_schema}.occurrence_{bison_datestr}_parquet"
@@ -103,20 +102,6 @@ class VARCHAR(max),
         ('HUMAN_OBSERVATION', 'OBSERVATION', 'OCCURRENCE', 'PRESERVED_SPECIMEN');
 """

-list_external_tables_stmt = f"""
-    SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
-    FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
-    WHERE cls.relnamespace = ns.oid
-    AND schemaname = '{external_schema}';
-"""
-
-list_public_tables_stmt = f"""
-    SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
-    FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
-    WHERE cls.relnamespace = ns.oid
-    AND schemaname = '{pub_schema}';
-"""
-
 count_gbif_stmt = f"SELECT COUNT(*) from {mounted_gbif_name};"
 count_bison_stmt = f"SELECT COUNT(*) FROM {subset_bison_name};"
 unmount_stmt = f"DROP TABLE {mounted_gbif_name};"
@@ -145,8 +130,8 @@ def lambda_handler(event, context):
     # Submit query request
     try:
         submit_result = client_redshift.execute_statement(
-            WorkgroupName=workgroup, Database=database, Sql=list_public_tables_stmt)
-        
print(f"*** {mounted_gbif_name} mount successfully executed") + WorkgroupName=workgroup, Database=database, Sql=mount_stmt) + print(f"*** Mount command submitted") except Exception as e: raise Exception(e) @@ -157,7 +142,7 @@ def lambda_handler(event, context): print(f"*** {k} = {v}") # ------------------------------------- - # Loop til complete + # Loop til complete, then describe result elapsed_time = 0 complete = False while not complete and elapsed_time < 300: @@ -178,20 +163,20 @@ def lambda_handler(event, context): print(f"Failed to describe_statement {e}") complete = True - # # ------------------------------------- - # # Get statement output for query - try: - stmt_result = client_redshift.get_statement_result(Id=submit_id) - except Exception as e: - print(f"Failed to get_statement_result {e}") - else: - print("*** statement_result records") - try: - records = stmt_result["Records"] - for rec in records: - print(f"*** {rec}") - except Exception as e: - print(f"Failed to return records ({e})") + # # # ------------------------------------- + # # # IFF query, get statement output + # try: + # stmt_result = client_redshift.get_statement_result(Id=submit_id) + # except Exception as e: + # print(f"*** No get_statement_result {e}") + # else: + # print("*** get_statement_result records") + # try: + # records = stmt_result["Records"] + # for rec in records: + # print(f"*** {rec}") + # except Exception as e: + # print(f"Failed to return records ({e})") return { 'statusCode': 200, diff --git a/aws/events/bison_query_lambda.py b/aws/events/bison_query_lambda.py new file mode 100644 index 00000000..bf465662 --- /dev/null +++ b/aws/events/bison_query_lambda.py @@ -0,0 +1,119 @@ +import json +import boto3 +import botocore.session as bc +from botocore.client import Config +from datetime import datetime +import time + +print('Loading function') + +region = "us-east-1" +workgroup = "bison" +database = "dev" +dbuser = "IAM:aimee.stewart" +dbuser = "arn:aws:iam::321942852011:role/service-role/bison_subset_gbif_lambda-role-9i5qvpux" +dbuser = "arn:aws:iam::321942852011:role/service-role/bison_subset_gbif_lambda-role-9i5qvpux" +bison_bucket = 'bison-321942852011-us-east-1' +timeout = 900 +waittime = 2 + +# Define the public bucket and file to query +gbif_bucket = f"gbif-open-data-{region}" +gbif_datestr = f"{datetime.now().year}-{datetime.now().month:02d}-01" +parquet_key = f"occurrence/{gbif_datestr}/occurrence.parquet" +bison_datestr = gbif_datestr.replace("-", "_") +pub_schema = "public" +external_schema = "redshift_spectrum" + +gbif_odr_data = f"s3://{gbif_bucket}/{parquet_key}/" +mounted_gbif_name = f"{external_schema}.occurrence_{bison_datestr}_parquet" +subset_bison_name = f"{pub_schema}.bison_{bison_datestr}" + +list_external_tables_stmt = f""" + SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime + FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid + WHERE cls.relnamespace = ns.oid + AND schemaname = '{external_schema}'; +""" + +list_public_tables_stmt = f""" + SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime + FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid + WHERE cls.relnamespace = ns.oid + AND schemaname = '{pub_schema}'; +""" + +count_gbif_stmt = f"SELECT COUNT(*) from {mounted_gbif_name};" +count_bison_stmt = f"SELECT COUNT(*) FROM {subset_bison_name};" + +# Initializing Botocore client +session = boto3.session.Session() +bc_session = bc.get_session() +session = 
boto3.Session( + botocore_session=bc_session, + region_name=region +) + +# Initializing Redshift's client +config = Config(connect_timeout=timeout, read_timeout=timeout) +client_redshift = session.client("redshift-data", config=config) + +# ----------------------------------------------------- +def lambda_handler(event, context): + print("*** Entered lambda_handler") + # ------------------------------------- + # Submit query request + try: + submit_result = client_redshift.execute_statement( + WorkgroupName=workgroup, Database=database, Sql=list_public_tables_stmt) + print(f"*** Mount command submitted") + + except Exception as e: + raise Exception(e) + + submit_id = submit_result['Id'] + print(f"*** submit id = {submit_id}") + for k, v in submit_result.items(): + print(f"*** {k} = {v}") + + # ------------------------------------- + # Loop til complete, then describe result + elapsed_time = 0 + complete = False + while not complete and elapsed_time < 300: + try: + describe_result = client_redshift.describe_statement(Id=submit_id) + status = describe_result["Status"] + print(f"*** Query Status - {status} after {elapsed_time} seconds") + if status in ("ABORTED", "FAILED", "FINISHED"): + complete = True + desc_id = describe_result['Id'] + print(f"*** desc id = {desc_id}") + for k, v in describe_result.items(): + print(f"*** {k} = {v}") + else: + time.sleep(waittime) + elapsed_time += waittime + except Exception as e: + print(f"Failed to describe_statement {e}") + complete = True + + # ------------------------------------- + # IFF query, get statement output + try: + stmt_result = client_redshift.get_statement_result(Id=submit_id) + except Exception as e: + print(f"*** No get_statement_result {e}") + else: + print("*** get_statement_result records") + try: + records = stmt_result["Records"] + for rec in records: + print(f"*** {rec}") + except Exception as e: + print(f"Failed to return records ({e})") + + return { + 'statusCode': 200, + 'body': json.dumps(f"Lambda result logged") + } diff --git a/aws/events/bison_subset_gbif_lambda.py b/aws/events/bison_subset_gbif_lambda.py index 85106bac..10c68f65 100644 --- a/aws/events/bison_subset_gbif_lambda.py +++ b/aws/events/bison_subset_gbif_lambda.py @@ -1,10 +1,8 @@ -import os -import json import boto3 -import botocore import botocore.session as bc from botocore.client import Config from datetime import datetime +import time print('Loading function') @@ -12,16 +10,18 @@ workgroup = "bison" database = "dev" dbuser = "IAM:aimee.stewart" -dbuser = "arn:aws:iam::321942852011:role/service-role/bison_subset_gbif_lambda-role-9i5qvpux" +dbuser = "arn:aws:iam::321942852011:role/service-role/bison_redshift_lambda_role" timeout = 900 +waittime = 2 +bison_bucket = 'bison-321942852011-us-east-1' +pub_schema = "public" +external_schema = "redshift_spectrum" # Define the public bucket and file to query gbif_bucket = f"gbif-open-data-{region}" gbif_datestr = f"{datetime.now().year}-{datetime.now().month:02d}-01" parquet_key = f"occurrence/{gbif_datestr}/occurrence.parquet" bison_datestr = gbif_datestr.replace("-", "_") -pub_schema = "public" -external_schema = "redshift_spectrum" gbif_odr_data = f"s3://{gbif_bucket}/{parquet_key}/" mounted_gbif_name = f"{external_schema}.occurrence_{bison_datestr}_parquet" @@ -102,30 +102,11 @@ class VARCHAR(max), ('HUMAN_OBSERVATION', 'OBSERVATION', 'OCCURRENCE', 'PRESERVED_SPECIMEN'); """ - -list_external_tables_stmt = f""" - SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime - FROM 
pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid - WHERE cls.relnamespace = ns.oid - AND schemaname = '{external_schema}'; -""" - -list_public_tables_stmt = f""" - SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime - FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid - WHERE cls.relnamespace = ns.oid - AND schemaname = '{pub_schema}'; -""" - count_gbif_stmt = f"SELECT COUNT(*) from {mounted_gbif_name};" count_bison_stmt = f"SELECT COUNT(*) FROM {subset_bison_name};" unmount_stmt = f"DROP TABLE {mounted_gbif_name};" -bison_bucket = 'bison-321942852011-us-east-1' -test_fname = 'bison_trigger_success.txt' -test_content = 'Success = True' session = boto3.session.Session() -region = "us-east-1" # Initializing Botocore client bc_session = bc.get_session() @@ -144,112 +125,41 @@ def lambda_handler(event, context): # ------------------------------------- # Mount GBIF data try: - mount_response = client_redshift.execute_statement( + submit_result = client_redshift.execute_statement( WorkgroupName=workgroup, Database=database, Sql=mount_stmt) - print(f"*** {mounted_gbif_name} mount successfully executed") - - except botocore.exceptions.ConnectionError as e: - client_redshift_1 = session.client("redshift-data", config=config) - mount_response = client_redshift_1.batch_execute_statement( - WorkgroupName=workgroup, Database=database, DbUser=dbuser, Sql=mount_stmt) - print(f"*** {mounted_gbif_name} mount after reestablishing the connection") + print("*** Mount submitted") except Exception as e: raise Exception(e) - print(str(mount_response)) - curr_id = mount_response['Id'] + curr_id = submit_result['Id'] print(f"*** id = {curr_id}") describe_response = client_redshift.describe_statement(Id=curr_id) print(str(describe_response)) - # # ------------------------------------- - # # Mount GBIF data - # try: - # mount_response = client_redshift.execute_statement( - # WorkgroupName=workgroup, Database=database, Sql=mount_stmt) - # print(f"*** {mounted_gbif_name} mount successfully executed") - # - # except botocore.exceptions.ConnectionError as e: - # client_redshift_1 = session.client("redshift-data", config=config) - # mount_response = client_redshift_1.execute_statement( - # WorkgroupName=workgroup, Database=database, DbUser=dbuser, Sql=mount_stmt) - # print(f"*** {mounted_gbif_name} mount after reestablishing the connection") - # - # except Exception as e: - # raise Exception(e) - # - # print(str(mount_response)) - # curr_id = mount_response['Id'] - # print(f"*** id = {curr_id}") - # describe_response = client_redshift.describe_statement(Id=curr_id) - # print(str(describe_response)) - # # ------------------------------------- - # # Wait for success - # - # # ------------------------------------- - # # Count GBIF records - # try: - # count_gbif_response = client_redshift.execute_statement( - # WorkgroupName=workgroup, Database=database, Sql=count_gbif_stmt) - # print("*** GBIF count successfully executed") - # - # except botocore.exceptions.ConnectionError as e: - # client_redshift_1 = session.client("redshift-data", config=config) - # count_gbif_response = client_redshift_1.execute_statement( - # WorkgroupName=workgroup, Database=database, DbUser=dbuser, Sql=mount_stmt) - # print("*** GBIF count after reestablishing the connection") - # - # except Exception as e: - # raise Exception(e) - # - # print(str(count_gbif_response)) - # curr_id = count_gbif_response['Id'] - # describe_response = 
client_redshift.describe_statement(Id=curr_id) - # print(f"*** id = {curr_id}") - # print(str(describe_response)) - # # ------------------------------------- - # # Subset GBIF data for BISON - # try: - # subset_response = client_redshift.execute_statement( - # WorkgroupName=workgroup, Database=database, Sql=subset_stmt) - # print(f"*** Subset to {subset_bison_name} successfully executed") - # - # except botocore.exceptions.ConnectionError as e: - # client_redshift_1 = session.client("redshift-data", config=config) - # subset_response = client_redshift_1.execute_statement( - # WorkgroupName=workgroup, Database=database, DbUser=dbuser, Sql=subset_stmt) - # print(f"*** Subset to {subset_bison_name} after reestablishing the connection") - # - # except Exception as e: - # raise Exception(e) - # - # print(f"*** id = {subset_response['Id']}") - # print(str(subset_response)) - # # ------------------------------------- - # # Count BISON records - # try: - # count_bison_response = client_redshift.execute_statement( - # WorkgroupName=workgroup, Database=database, DbUser=dbuser, Sql=count_bison_stmt) - # print("*** BISON count successfully executed") - # - # except botocore.exceptions.ConnectionError as e: - # client_redshift_1 = session.client("redshift-data", config=config) - # count_bison_response = client_redshift_1.execute_statement( - # WorkgroupName=workgroup, Database=database, Sql=count_bison_stmt) - # print("*** BISON count after reestablishing the connection") - # - # except Exception as e: - # raise Exception(e) - - print(f"*** id = {count_bison_response['Id']}") - print(str(count_bison_response)) + submit_id = submit_result['Id'] + print(f"*** submit id = {submit_id}") + for k, v in submit_result.items(): + print(f"*** {k} = {v}") + # ------------------------------------- - # Place test file in bucket to indicate success - s3 = boto3.client('s3', region_name=region) - s3.put_object(Body=test_content, Bucket=bison_bucket, Key=test_fname) - - return { - 'statusCode': 200, - 'body': json.dumps(f"Lambda result: {str(mount_response)}") - } + # Loop til complete, then describe result + elapsed_time = 0 + complete = False + while not complete and elapsed_time < 300: + try: + describe_result = client_redshift.describe_statement(Id=submit_id) + status = describe_result["Status"] + print(f"*** Query Status - {status} after {elapsed_time} seconds") + if status in ("ABORTED", "FAILED", "FINISHED"): + complete = True + desc_id = describe_result['Id'] + print(f"*** desc id = {desc_id}") + for k, v in describe_result.items(): + print(f"*** {k} = {v}") + else: + time.sleep(waittime) + elapsed_time += waittime + except Exception as e: + print(f"Failed to describe_statement {e}") + complete = True diff --git a/aws/redshift/load_ancillary_data.sql b/aws/redshift/load_ancillary_data.sql index e8028c7c..e5834130 100644 --- a/aws/redshift/load_ancillary_data.sql +++ b/aws/redshift/load_ancillary_data.sql @@ -47,7 +47,7 @@ CREATE TABLE riisv2_2024_08_01 ( ); COPY riisv2_2024_08_01 -FROM 's3://bison-321942852011-us-east-1/input_data/USRIISv2_MasterList_annotated_2024_08_01.csv' +FROM 's3://bison-321942852011-us-east-1/input/USRIISv2_MasterList_annotated_2024_08_01.csv' FORMAT CSV IAM_role DEFAULT; @@ -75,7 +75,7 @@ CREATE TABLE aiannh2023 ( ); COPY aiannh2023 -FROM 's3://bison-321942852011-us-east-1/input_data/cb_2023_us_aiannh_500k.shp' +FROM 's3://bison-321942852011-us-east-1/input/cb_2023_us_aiannh_500k.shp' FORMAT SHAPEFILE IAM_role DEFAULT; @@ -103,7 +103,7 @@ CREATE TABLE county2023 ( ); COPY county2023 
-FROM 's3://bison-321942852011-us-east-1/input_data/cb_2023_us_county_500k.shp'
+FROM 's3://bison-321942852011-us-east-1/input/cb_2023_us_county_500k.shp'
 FORMAT SHAPEFILE
 IAM_role DEFAULT;

@@ -126,13 +126,11 @@ CREATE TABLE pad1 (
    GIS_Acres  VARCHAR(max)
 );

-COPY pad1 FROM 's3://bison-321942852011-us-east-1/input_data/pad_4.0_gap1_4326.shp'
+COPY pad1 FROM 's3://bison-321942852011-us-east-1/input/pad_4.0_gap1_4326.shp'
 FORMAT SHAPEFILE
 SIMPLIFY AUTO
 IAM_role DEFAULT;

-SELECT query_id, start_time, line_number, column_name, column_type, error_message
-  FROM sys_load_error_detail ORDER BY start_time DESC;

---- -------------------------------------------------------------------------------------
---- -------------------------------------------------------------------------------------
diff --git a/aws/redshift/queries.sql b/aws/redshift/queries.sql
new file mode 100644
index 00000000..78386319
--- /dev/null
+++ b/aws/redshift/queries.sql
@@ -0,0 +1,21 @@
+-- Get last error message
+SELECT query_id, start_time, line_number, column_name, column_type, error_message
+  FROM sys_load_error_detail ORDER BY start_time DESC;
+
+-- Count records after data load
+SELECT COUNT(*) from dev.redshift_spectrum.occurrence_2024_09_01_parquet;
+SELECT COUNT(*) FROM public.bison_2024_09_01;
+
+-- List Redshift tables and creation times
+SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime
+FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid
+WHERE cls.relnamespace = ns.oid
+  AND schemaname = 'public';
+
+
+SELECT * FROM svv_all_schemas WHERE database_name = 'dev'
+ORDER BY database_name, SCHEMA_NAME;
+
+SELECT current_user;
+SELECT * FROM PG_USER_INFO;
+
diff --git a/aws/redshift/subset_to_bison.sql b/aws/redshift/subset_to_bison.sql
index 6799188b..009d12b8 100644
--- a/aws/redshift/subset_to_bison.sql
+++ b/aws/redshift/subset_to_bison.sql
@@ -1,3 +1,4 @@
+
 -- Mount S3 GBIF Open Data Registry as an external table, then subset it for BISON

-------------------
 -- Mount GBIF
 -- -------------------------------------------------------------------------------------
 -- Create a schema for mounting external data
--- Throws error if pre-existing
-CREATE external schema redshift_spectrum
+-- May throw an error if the schema is pre-existing, so drop it first.
+-- This also creates a new external database "dev", though it appears in the console to
+-- be the same "dev" database that contains the public schema.
+DROP EXTERNAL SCHEMA redshift_spectrum;
+CREATE EXTERNAL SCHEMA IF NOT EXISTS redshift_spectrum
 FROM data catalog
- DATABASE dev
- IAM_ROLE 'arn:aws:iam::321942852011:role/service-role/AmazonRedshift-CommandsAccessRole-20231129T105842'
+ DATABASE 'dev'
+ -- Same role as the namespace
+ IAM_ROLE 'arn:aws:iam::321942852011:role/bison_redshift_s3_role'
 CREATE external database if NOT exists;
+-- If the IAM role changes, re-grant access:
+--GRANT USAGE TO redshift_spectrum to "IAMR:bison_subset_gbif_lambda-role-9i5qvpux";
+GRANT ALL ON ALL TABLES IN SCHEMA redshift_spectrum
+    TO ROLE 'arn:aws:iam::321942852011:role/service-role/bison_subset_gbif_lambda-role-9i5qvpux';
+
+
+
 -- Mount a table of current GBIF ODR data in S3
+-- An error indicating that the "dev" database does not exist refers to the external
+-- database, and may indicate that the role used by the command and/or namespace
+-- differs from the role granted to the schema upon creation.
CREATE EXTERNAL TABLE redshift_spectrum.occurrence_2024_09_01_parquet ( gbifid VARCHAR(max), datasetkey VARCHAR(max), @@ -73,11 +88,12 @@ CREATE EXTERNAL TABLE redshift_spectrum.occurrence_2024_09_01_parquet ( STORED AS PARQUET LOCATION 's3://gbif-open-data-us-east-1/occurrence/2024-09-01/occurrence.parquet/'; + -- ------------------------------------------------------------------------------------- -- Subset for BISON -- ------------------------------------------------------------------------------------- -- Drop previous table; -DROP TABLE IF EXISTS public.bison_2024_07_01; +DROP TABLE IF EXISTS public.bison_2024_08_01; -- Create a BISON table with a subset of records and subset of fields -- TODO: This includes lat/lon, allowing final export to Parquet after deleting geom CREATE TABLE public.bison_2024_09_01 AS @@ -98,19 +114,6 @@ CREATE TABLE public.bison_2024_09_01 AS AND basisofrecord IN ('HUMAN_OBSERVATION', 'OBSERVATION', 'OCCURRENCE', 'PRESERVED_SPECIMEN'); --- ------------------------------------------------------------------------------------- --- Misc Queries --- ------------------------------------------------------------------------------------- --- Count records from full GBIF and BISON subset -SELECT COUNT(*) from dev.redshift_spectrum.occurrence_2024_09_01_parquet; -SELECT COUNT(*) FROM public.bison_2024_09_01; - --- List Redshift tables and creation times -SELECT reloid AS tableid, nspname as schemaname, relname as tablename, relcreationtime -FROM pg_class_info cls LEFT JOIN pg_namespace ns ON cls.relnamespace=ns.oid -WHERE cls.relnamespace = ns.oid - AND schemaname = 'public'; - -- ------------------------------------------------------------------------------------- -- Unmount original GBIF data -- -------------------------------------------------------------------------------------
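-- A minimal sketch of the unmount step (not shown in this hunk): assuming the
-- 2024-09-01 mount name used above, and matching unmount_stmt in the lambda code,
-- dropping the external table leaves the source data in the GBIF ODR bucket untouched.
DROP TABLE redshift_spectrum.occurrence_2024_09_01_parquet;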