Adding code for BigQuery policy tag extractor #398

Open · wants to merge 6 commits into base: master
19 changes: 16 additions & 3 deletions scripts/optimization/README.md
@@ -5,18 +5,31 @@ named, `optimization_workshop`, with several tables inside the dataset. \
These tables are populated with information to help you optimize your BigQuery
tables, views, and queries.

Run all the scripts within this folder using the following commands:
Run all the .sql scripts within this folder using the following commands:

```bash
gcloud auth login &&
bash run_all_scripts.sh
```

Run [Anti-pattern Recognition Tool](https://github.com/GoogleCloudPlatform/bigquery-antipattern-recognition/tree/main):
The `anti-pattern-recognittion-tool-scripts` subfolder contains additional scripts that need to be executed separately to run
the [Anti-pattern Recognition Tool](https://github.com/GoogleCloudPlatform/bigquery-antipattern-recognition/tree/main):
```bash
bash run_anti_pattern_tool.sh
bash ./anti-pattern-recognittion-tool-scripts/run_anti_pattern_tool.sh \
--input_table_name="optimization_workshop.viewable_queries_grouped_by_hash" \
--input_table_id_col_name="Query_Hash" \
--input_table_query_text_col_name="Query_Raw_Sample" \
--input_table_slots_col_name="Total_Slot_Hours"

bash ./anti-pattern-recognittion-tool-scripts/run_anti_pattern_tool.sh \
--input_table_name="optimization_workshop.queries_grouped_by_hash_project" \
--input_table_id_col_name="query_hash" \
--input_table_query_text_col_name="top_10_jobs[SAFE_OFFSET(0)].query_text" \
--input_table_slots_col_name="avg_total_slots"
```

The above command takes `<dataset>.<tablename>` as input. It can be executed on any table with a schema similar to the one generated by `viewable_queries_grouped_by_hash`.
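
For example, a hypothetical invocation against a custom query-history table might look like this (the table and column names below are placeholders, not objects created by these scripts):

```bash
# Hypothetical example: run the tool against a custom table with a compatible
# schema. Replace the table and column names with those from your own table.
bash ./anti-pattern-recognittion-tool-scripts/run_anti_pattern_tool.sh \
  --input_table_name="my_dataset.my_queries_grouped_by_hash" \
  --input_table_id_col_name="query_hash" \
  --input_table_query_text_col_name="query_text" \
  --input_table_slots_col_name="total_slot_hours"
```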

The scripts are described in more detail in the following sections.

---
@@ -26,16 +26,16 @@ CREATE OR REPLACE TABLE optimization_workshop.antipattern_output_table (

CREATE OR REPLACE VIEW optimization_workshop.antipattern_tool_input_view AS
SELECT
Query_Hash id,
ANY_VALUE(Query_Raw_Sample) query,
<input_table_id_col_name> id,
ANY_VALUE(<input_table_query_text_col_name>) query
FROM
optimization_workshop.viewable_queries_grouped_by_hash
<input_table>
WHERE
Query_Hash is not null
<input_table_id_col_name> is not null
GROUP BY
Query_Hash
<input_table_id_col_name>
ORDER BY
ANY_VALUE(Total_Slot_Hours) desc
ANY_VALUE(<input_table_slots_col_name>) desc
LIMIT
1000
;
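
To preview what this template expands to before anything runs, the same placeholder substitution performed by `run_anti_pattern_tool.sh` can be applied by hand. A minimal sketch, assuming it is run from the `scripts/optimization` directory with the values from the first README example:

```bash
# Sketch: render the templated SQL with concrete values, mirroring the sed
# substitution done by run_anti_pattern_tool.sh, without sending it to BigQuery.
sed -e "s/<input_table>/optimization_workshop.viewable_queries_grouped_by_hash/g" \
    -e "s/<input_table_id_col_name>/Query_Hash/g" \
    -e "s/<input_table_query_text_col_name>/Query_Raw_Sample/g" \
    -e "s/<input_table_slots_col_name>/Total_Slot_Hours/g" \
    ./anti-pattern-recognittion-tool-scripts/anti_pattern_recoginition_tool_tables.sql
```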
@@ -16,6 +16,35 @@

# Exit immediately if a command exits with a non-zero status.
set -e

# Get input_table name as input
for i in "$@"; do
case $i in
--input_table_name=*)
input_table_name="${i#*=}"
shift # past argument=value
;;
--input_table_id_col_name=*)
input_table_id_col_name="${i#*=}"
shift # past argument=value
;;
--input_table_query_text_col_name=*)
input_table_query_text_col_name="${i#*=}"
shift # past argument=value
;;
--input_table_slots_col_name=*)
input_table_slots_col_name="${i#*=}"
shift # past argument=value
;;
-*|--*)
echo "Unknown option $i"
exit 1
;;
*)
;;
esac
done

# Set the following flags for the bq command:
# --quiet: suppress status updates while jobs are running
# --nouse_legacy_sql: use standard SQL syntax
@@ -24,7 +53,13 @@ bq_flags="--quiet --nouse_legacy_sql --nouse_cache"


# Run setup for anti pattern recognition tool
bq query ${bq_flags} <anti_pattern_recoginition_tool_tables.sql
anti_pattern_recoginition_tool_tables_sql=$(sed -e "s/<input_table>/$input_table_name/g" \
-e "s/<input_table_id_col_name>/$input_table_id_col_name/g" \
-e "s/<input_table_query_text_col_name>/$input_table_query_text_col_name/g" \
-e "s/<input_table_slots_col_name>/$input_table_slots_col_name/g" \
"./anti-pattern-recognittion-tool-scripts/anti_pattern_recoginition_tool_tables.sql")

bq query ${bq_flags} <<< "$anti_pattern_recoginition_tool_tables_sql"

{ # try

@@ -38,8 +73,11 @@ bq query ${bq_flags} <anti_pattern_recoginition_tool_tables.sql
--input_bq_table ${PROJECT_ID}.optimization_workshop.antipattern_tool_input_view \
--output_table ${PROJECT_ID}.optimization_workshop.antipattern_output_table

# write anti pattern output to queries by has table
bq query ${bq_flags} <update_queries_by_hash_w_anti_patterns.sql
# write anti pattern output to queries by hash table
update_queries_by_hash_w_anti_patterns_sql=$(sed -e "s/<input_table>/$input_table_name/g" \
-e "s/<input_table_id_col_name>/$input_table_id_col_name/g" \
"./anti-pattern-recognittion-tool-scripts/update_queries_by_hash_w_anti_patterns.sql")
bq query ${bq_flags} <<< "$update_queries_by_hash_w_anti_patterns_sql"

} || { # catch
echo 'Error: could not run Anti-pattern Recognition Tool. Try using GCP Cloud Shell https://cloud.google.com/shell/docs/launching-cloud-shell'
@@ -14,10 +14,10 @@
* limitations under the License.
*/

ALTER TABLE optimization_workshop.viewable_queries_grouped_by_hash
ALTER TABLE <input_table>
ADD COLUMN IF NOT EXISTS recommendation ARRAY<STRUCT<name STRING, description STRING>>;

UPDATE optimization_workshop.viewable_queries_grouped_by_hash t1
UPDATE <input_table> t1
SET t1.recommendation = t2.recommendation
FROM optimization_workshop.antipattern_output_table t2
WHERE t1.Query_Hash = t2.job_id;
WHERE t1.<input_table_id_col_name> = t2.job_id;
@@ -41,7 +41,7 @@ CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS (
);

CREATE SCHEMA IF NOT EXISTS optimization_workshop;
CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_hash AS
CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_hash_org AS
SELECT
statement_type,
query_info.query_hashes.normalized_literals AS query_hash,
@@ -53,8 +53,7 @@ SELECT
ARRAY_AGG(
STRUCT(
bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url,
bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url,
query as query_text
bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url
)
ORDER BY total_slot_ms
DESC LIMIT 10) AS top_10_jobs,
85 changes: 85 additions & 0 deletions scripts/optimization/queries_grouped_by_hash_project.sql
@@ -0,0 +1,85 @@
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* This script creates a table named queries_grouped_by_hash_project,
* which aggregates the queries run in the current project over the past
* 30 days, grouped by their normalized query hash.
* 30 days is the default timeframe, but you can change this by setting the
* num_days_to_scan variable to a different value.
* Queries are grouped by their normalized query pattern, which ignores
* comments, parameter values, UDFs, and literals in the query text.
* This allows us to group queries that are logically the same, but
* have different literals.
*
* For example, the following queries would be grouped together:
* SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01'
* SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02'
* SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03'
*/

DECLARE num_days_to_scan INT64 DEFAULT 30;

CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS (
COALESCE((
SELECT SUM(IF(i.slot_contention, 1, 0) + IF(i.insufficient_shuffle_quota, 1, 0))
FROM UNNEST(query_info.performance_insights.stage_performance_standalone_insights) i), 0)
+ COALESCE(ARRAY_LENGTH(query_info.performance_insights.stage_performance_change_insights), 0)
);

CREATE SCHEMA IF NOT EXISTS optimization_workshop;
CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_hash_project AS
SELECT
statement_type,
query_info.query_hashes.normalized_literals AS query_hash,
COUNT(DISTINCT DATE(start_time)) AS days_active,
ARRAY_AGG(DISTINCT project_id IGNORE NULLS) AS project_ids,
ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids,
SUM(num_stages_with_perf_insights(query_info)) AS num_stages_with_perf_insights,
COUNT(DISTINCT (project_id || ':us.' || job_id)) AS job_count,
ARRAY_AGG(
STRUCT(
bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url,
bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url,
query as query_text
)
ORDER BY total_slot_ms
DESC LIMIT 10) AS top_10_jobs,
ARRAY_AGG(DISTINCT user_email) AS user_emails,
SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed,
SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours,
AVG(total_slot_ms) / (1000 * 60 * 60) AS avg_total_slot_hours_per_active_day,
AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds,
ARRAY_AGG(DISTINCT FORMAT("%T",labels)) AS labels,
SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots,
AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots,
-- query hashes will all have the same referenced tables so we can use ANY_VALUE below
ANY_VALUE(ARRAY(
SELECT
ref_table.project_id || '.' ||
IF(STARTS_WITH(ref_table.dataset_id, '_'), 'TEMP', ref_table.dataset_id)
|| '.' || ref_table.table_id
FROM UNNEST(referenced_tables) ref_table
)) AS referenced_tables,
FROM `region-us`.INFORMATION_SCHEMA.JOBS
WHERE
DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
AND state = 'DONE'
AND error_result IS NULL
AND job_type = 'QUERY'
AND statement_type != 'SCRIPT'
GROUP BY statement_type, query_hash;
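
Once this table exists, the heaviest query patterns can be inspected directly. A minimal sketch, assuming the script above has already been run in the current project:

```bash
# Sketch: list the ten query patterns with the highest total slot usage from
# the table created by queries_grouped_by_hash_project.sql.
bq query --nouse_legacy_sql '
SELECT
  query_hash,
  job_count,
  total_slot_hours,
  total_gigabytes_processed
FROM optimization_workshop.queries_grouped_by_hash_project
ORDER BY total_slot_hours DESC
LIMIT 10'
```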
24 changes: 24 additions & 0 deletions scripts/policy_tag_extractor/README.md
@@ -0,0 +1,24 @@
# BigQuery Policy Tag Extractor

## Introduction
This directory contains the [policy_tag_export.sh](policy_tag_export.sh) bash script, which extracts BigQuery policy tag information from a given dataset. The script iterates through at most 10,000 tables in the dataset and, for every column with a policy tag, outputs the table name, column name, and policy tag ID in CSV format.
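
Each tagged column becomes one row in the output file. A hypothetical example of the resulting CSV (the table, column, and identifier values below are invented for illustration):

```bash
# Hypothetical contents of policy_tags.csv after a run; all values shown in
# the comments below are illustrative only.
#   orders,customer_email,projects/my-project/locations/us/taxonomies/1234567890/policyTags/9876543210
#   orders,customer_phone,projects/my-project/locations/us/taxonomies/1234567890/policyTags/1122334455
cat policy_tags.csv
```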

## Instructions for use
The simplest way to execute this script is to run it directly in Cloud Shell, but if needed it can be executed as part of a larger CI/CD pipeline or process.

Before using, either export the `DATASET` environment variable with the dataset that needs to be reviewed, or be ready to enter the dataset name when the script prompts for it.

To execute in Cloud Shell (a condensed command sequence is sketched after this list):
1. [Launch a Cloud Shell session](https://cloud.google.com/shell/docs/launching-cloud-shell) in the GCP project where your BigQuery data resides.
    * When Cloud Shell is started, the active project in Cloud Shell is propagated to your gcloud configuration inside Cloud Shell for immediate use. GOOGLE_CLOUD_PROJECT, the environment variable used by the Application Default Credentials library, is also set to point to the active project in Cloud Shell. You can also explicitly set the project using `gcloud config set project [PROJECT_ID]`.
1. [Upload](https://cloud.google.com/shell/docs/uploading-and-downloading-files#upload_and_download_files_and_folders) the policy_tag_export.sh script to the Cloud Shell environment.
1. Execute the script by running `bash policy_tag_export.sh`.
1. List the resources in Cloud Shell (`ls`) and verify that a file called `policy_tags.csv` was created.
1. [Download](https://cloud.google.com/shell/docs/uploading-and-downloading-files#upload_and_download_files_and_folders) the file.
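
The same workflow, condensed into a single sequence (a sketch; it assumes `policy_tag_export.sh` is already uploaded to your Cloud Shell home directory, and the project and dataset names are placeholders):

```bash
# Sketch of the Cloud Shell steps above. Project and dataset names are
# placeholders; if DATASET is not exported, the script prompts for it instead.
gcloud config set project my-project-id
export DATASET=my_dataset
bash policy_tag_export.sh
ls policy_tags.csv   # verify the output file was created
# Download policy_tags.csv using the Cloud Shell download option linked in step 5.
```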

## Considerations
* Ensure that you or the service account executing the bash script has the `bigquery.metadataViewer` role, which provides the metadata access the script requires.
* Currently, the extractor only handles simple column types. RECORD type columns with nested policy tags are not supported.
* The extractor can identify specific policy tags on columns, but is limited to the information available to the bq command-line tool. In its current state, this is the full policy tag identifier, shown below (a small parsing sketch follows it):

```
projects/<PROJECT_ID>/locations/<LOCATION>/taxonomies/<TAXONOMY_ID>/policyTags/<TAG_ID>
```
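
If only the numeric taxonomy and tag IDs are needed, they can be extracted from that identifier with standard shell tools. A minimal sketch (the identifier value below is hypothetical):

```bash
# Sketch: split a full policy tag identifier into its taxonomy ID and tag ID.
# The identifier below is a hypothetical example value.
full_tag="projects/my-project/locations/us/taxonomies/1234567890/policyTags/9876543210"
taxonomy_id=$(echo "$full_tag" | awk -F'/' '{print $6}')
tag_id=$(echo "$full_tag" | awk -F'/' '{print $8}')
echo "taxonomy=${taxonomy_id} tag=${tag_id}"
```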
38 changes: 38 additions & 0 deletions scripts/policy_tag_extractor/policy_tag_export.sh
@@ -0,0 +1,38 @@
#!/bin/bash

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Prompt user for DATASET value if not set
if [ -z "$DATASET" ]; then
read -p "Enter the BigQuery dataset name: " DATASET
fi

#write all tables in a dataset to a reference TXT file
bq --format=sparse ls --max_results=10000 ${DATASET} | awk '{ print $1 }' | sed '1,2d' > table_list.txt

#loop through each table and export policy tags (if any) to a CSV
echo "Writing to CSV..."
while IFS= read -r TABLE; do
TAG_COUNT="`bq show --schema ${DATASET}.${TABLE} | grep "policyTags" | wc -l`"

if [ "${TAG_COUNT}" -ge 1 ]
then
COLUMN_AND_TAG=`bq show --format=prettyjson ${DATASET}.${TABLE} | jq '.schema.fields[] | select(.policyTags | length>=1)'`
Collaborator: This doesn't handle RECORD type columns with nested policy tags. Can you either handle it in code or make an explicit callout in README that this script only handles simple column types.

Author: @danieldeleo added a line to the Considerations section of the README calling this out. Will work on updating the code to handle nested tags in the future.

COLUMN=`echo $COLUMN_AND_TAG | jq '.name'`
TAG_ID=`echo $COLUMN_AND_TAG | jq '.policyTags.names[]'`
echo ${TABLE},${COLUMN},${TAG_ID} | tr -d '"'
fi
done < table_list.txt >> policy_tags.csv
echo "Done."