From ef8a912dfbe80fed9b40aa1cc626e3a4a5c9681d Mon Sep 17 00:00:00 2001 From: Nick Burgan <122842472+nb1701@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:56:02 -0700 Subject: [PATCH] Add ability to force unlock TF state and fix digest value of lock file in DynamoDB (#235) --- bin/fmt | 32 ++++ cloud/aws/templates/aws_oidc/bin/aws_cli.py | 29 +++- cloud/shared/bin/lib/terraform.py | 157 ++++++++++++++++++-- cloud/shared/bin/run | 14 +- cloud/shared/bin/run.py | 22 +++ 5 files changed, 233 insertions(+), 21 deletions(-) create mode 100755 bin/fmt diff --git a/bin/fmt b/bin/fmt new file mode 100755 index 00000000..ae38776e --- /dev/null +++ b/bin/fmt @@ -0,0 +1,32 @@ +#! /usr/bin/env bash + +# DOC: Format terraform, python, and shell script files. + +if which -s terraform; then + echo "Formatting Terraform files" + terraform fmt -recursive -write +else + echo "Can not find the terraform binary. Please install Terraform first." + exit 1 +fi + +if which -s shfmt; then + echo "Formatting shell scripts" + shfmt -bn -ci -i 2 -w -l $(shfmt -f .) +else + echo "Could not find the shfmt binary. Please install shfmt first." + exit 1 +fi + +if which -s yapf; then + echo "Formatting python files" + yapf \ + --verbose \ + --style='{based_on_style: google, SPLIT_BEFORE_FIRST_ARGUMENT:true}' \ + --in-place \ + --recursive \ + . +else + echo "Could not find yapf. Please install it first." + exit 1 +fi diff --git a/cloud/aws/templates/aws_oidc/bin/aws_cli.py b/cloud/aws/templates/aws_oidc/bin/aws_cli.py index 36b7d9a5..cb12d48a 100644 --- a/cloud/aws/templates/aws_oidc/bin/aws_cli.py +++ b/cloud/aws/templates/aws_oidc/bin/aws_cli.py @@ -109,6 +109,24 @@ def wait_for_ecs_service_healthy(self): ) time.sleep(30) + def set_lock_table_digest_value(self, value): + """ + Sets the lock file digest value in DynamoDB to the given value. This + digest value is a checksum of the Terraform state file stored in S3. + + If something goes wrong during deployment, especially when a user has + force-unlocked due to a previous issue and then multiple apply actions + are happening at once, the digest value for the Terraform lock file in + S3 can be incorrect. This function lets us set the digest value to + the correct value, as given by the error message of a previous + Terraform command, without having to go into the AWS console to + set it manually. + """ + table = f'{self.config.app_prefix}-{resources.S3_TERRAFORM_LOCK_TABLE}' + file = f'{self.config.app_prefix}-{resources.S3_TERRAFORM_STATE_BUCKET}' + command = f'dynamodb put-item --table-name={table} --item=\'{{"LockID":{{"S":"{file}/tfstate/terraform.tfstate-md5"}},"Digest":{{"S":"{value}"}}}}\'' + self._call_cli(command, False) # output = False + def _ecs_service_state(self) -> Dict: """ Returns the ID and rolloutState of the PRIMARY ECS service deployment. If @@ -169,7 +187,12 @@ def get_url_of_secret(self, secret_name: str) -> str: def get_url_of_s3_bucket(self, bucket_name: str) -> str: return f"https://{self.config.aws_region}.console.aws.amazon.com/s3/buckets/{bucket_name}" - def _call_cli(self, command: str) -> Dict: - command = f"aws --output=json --region={self.config.aws_region} " + command + def _call_cli(self, command: str, output: bool = True) -> Dict: + base = f"aws --region={self.config.aws_region} " + if output: + base += "--output=json " + command = base + command out = subprocess.check_output(shlex.split(command)) - return json.loads(out.decode("ascii")) + if output: + return json.loads(out.decode("ascii")) + return diff --git a/cloud/shared/bin/lib/terraform.py b/cloud/shared/bin/lib/terraform.py index 56554f87..1a378ac9 100644 --- a/cloud/shared/bin/lib/terraform.py +++ b/cloud/shared/bin/lib/terraform.py @@ -1,37 +1,119 @@ import subprocess import os +import sys +import re import shutil import shlex +import inspect from typing import Optional from cloud.shared.bin.lib.config_loader import ConfigLoader from cloud.shared.bin.lib.print import print +from cloud.aws.templates.aws_oidc.bin.aws_cli import AwsCli -# TODO(#2741): When using this for Azure make sure to setup backend bucket prior to calling these functions. -def perform_apply( +def force_unlock( config_loader: ConfigLoader, - is_destroy=False, - terraform_template_dir: Optional[str] = None): - '''Generates terraform variable files and runs terraform init and apply.''' + lock_id: str, + terraform_template_dir: Optional[str] = None, + initialize=True): if not terraform_template_dir: terraform_template_dir = config_loader.get_template_dir() - tf_vars_filename = config_loader.tfvars_filename - terraform_cmd = f'terraform -chdir={terraform_template_dir}' + if initialize: + perform_init( + config_loader, terraform_template_dir, False) # upgrade = False + + terraform_cmd = f'terraform -chdir={terraform_template_dir} force-unlock -force {lock_id}' + print(f" - Run {terraform_cmd}") + subprocess.check_call(shlex.split(terraform_cmd)) + + +def perform_init( + config_loader: ConfigLoader, + terraform_template_dir: Optional[str] = None, + upgrade: bool = True): + if not terraform_template_dir: + terraform_template_dir = config_loader.get_template_dir() + + init_cmd = f'terraform -chdir={terraform_template_dir} init' + if upgrade: + init_cmd += ' -upgrade' if config_loader.use_local_backend: - print(' - Run terraform init -upgrade -reconfigure') - subprocess.check_call( - shlex.split(f'{terraform_cmd} init -upgrade -reconfigure')) + init_cmd += ' -reconfigure' else: - print(' - Run terraform init -upgrade') - init_cmd = f'{terraform_cmd} init -input=false -upgrade' + init_cmd += ' -input=false' # backend vars file can be absent when pre-terraform setup is running if os.path.exists(os.path.join(terraform_template_dir, config_loader.backend_vars_filename)): init_cmd += f' -backend-config={config_loader.backend_vars_filename}' - subprocess.check_call(shlex.split(init_cmd)) + print(f" - Run {init_cmd}") + output, exit_code = capture_stderr(init_cmd) + if exit_code > 0: + # Determine if we're running interactively + is_tty = sys.stdin.isatty() + # This is AWS-specific, and should be modified when we have actual + # Azure deployments + if 'state data in S3 does not have the expected content' in output: + match = re.search(r'value: ([0-9a-f]{32})', output) + if match: + digest = match.group(match.lastindex) + if is_tty: + answer = input( + "Would you like to fix this by setting the correct digest value? Ensure that no other deployment processes are in progress. [Y/n] >" + ) + if answer.lower() in ['y', 'yes', '']: + aws = AwsCli(config_loader) + aws.set_lock_table_digest_value(digest) + perform_init( + config_loader, terraform_template_dir, upgrade) + return + print( + f"To fix the above error, rerun the command with LOCK_TABLE_DIGEST_VALUE=\"{digest}\" before it." + ) + # Since we've handled the error and printed a message, exit immediately + # rather than returning False and having it print a stack trace. + exit(exit_code) + raise RuntimeError( + "Unhandled error during terraform init. See error message above for details." + ) + + +# We specifically don't want to capture stdout here. When running in interactive mode, +# we'd miss the prompt to enter "yes" to continue on a terraform apply, even if we're +# printing each line as it comes in, since the line the prompt is on does not contain +# a new line character. +def capture_stderr(cmd): + popen = subprocess.Popen( + shlex.split(cmd), + stderr=subprocess.PIPE, + bufsize=1, + universal_newlines=True) + try: + exit_code = popen.wait() + _, stderr = popen.communicate() + if stderr: + print(stderr) + return stderr, exit_code + except KeyboardInterrupt: + # Allow terraform to gracefully exit if a user Ctrl+C's out of the command + popen.terminate() + + +# TODO(#2741): When using this for Azure make sure to setup backend bucket prior to calling these functions. +def perform_apply( + config_loader: ConfigLoader, + is_destroy=False, + terraform_template_dir: Optional[str] = None, + initialize=True): + '''Generates terraform variable files and runs terraform init and apply.''' + if not terraform_template_dir: + terraform_template_dir = config_loader.get_template_dir() + tf_vars_filename = config_loader.tfvars_filename + + if initialize: + perform_init(config_loader, terraform_template_dir) if os.path.exists(os.path.join(terraform_template_dir, tf_vars_filename)): print( @@ -45,16 +127,59 @@ def perform_apply( print(" - Test. Not applying terraform.") return True - print(" - Run terraform apply") # Enable compact-warnings as we have a bunch of # "value of undeclared variables" warnings as some variables used in one # deployment (e.g. aws) but not the other. - terraform_apply_cmd = f'{terraform_cmd} apply -input=false -var-file={tf_vars_filename} -compact-warnings' + terraform_apply_cmd = f'terraform -chdir={terraform_template_dir} apply -input=false -var-file={tf_vars_filename} -compact-warnings' if config_loader.skip_confirmations: terraform_apply_cmd += ' -auto-approve' if is_destroy: terraform_apply_cmd += ' -destroy' - subprocess.check_call(shlex.split(terraform_apply_cmd)) + + print(f" - Run {terraform_apply_cmd}") + + output, exit_code = capture_stderr(terraform_apply_cmd) + if exit_code > 0: + # Determine if we're running interactively + is_tty = sys.stdin.isatty() + if "Error acquiring the state lock" in output: + # Lock ID is a standard UUID v4 in the form 00000000-0000-0000-0000-000000000000 + match = re.search( + r'ID:\s+([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', + output) + error_text = inspect.cleandoc( + """ + The Terraform state lock can not be acquired. + This can happen if you are running a command in another process, or if another Terraform process exited prematurely. + """) + if match: + lock_id = match.group(match.lastindex) + if is_tty: + answer = input( + "Would you like to fix this by force-unlocking the Terraform state? Ensure that no other deployment processes are in progress. [Y/n] >" + ) + if answer.lower() in ['y', 'yes', '']: + force_unlock( + config_loader, lock_id, terraform_template_dir, + False) # initialize = False + return perform_apply( + config_loader, is_destroy, terraform_template_dir, + False) # initialize = False + print( + error_text + + f"\nIf you are sure there are no other Terraform processes running, this can be fixed by rerunning the command with FORCE_UNLOCK_ID=\"{lock_id}\" before it." + ) + else: + print( + error_text + + "\nWe were unable to extract the lock ID from the error text. Inspect the error message above." + "\nIf you are sure there are no other Terraform processes running, this error can be fixed by rerunning the command with FORCE_UNLOCK_ID= before it." + ) + # Since we've handled the error and printed a message, exit immediately + # rather than returning False and having it print a stack trace. + exit(exit_code) + return False + return True diff --git a/cloud/shared/bin/run b/cloud/shared/bin/run index eaccba21..b2d2cec4 100755 --- a/cloud/shared/bin/run +++ b/cloud/shared/bin/run @@ -19,7 +19,7 @@ set -o pipefail source cloud/shared/bin/python_env_setup # Get the arguments that we want to pass to run.py -while getopts s:c:t: flag; do +while getopts s:c:t:u:d: flag; do case "${flag}" in # The civiform_config file that contains the values to configure the deployment s) source_config=${OPTARG} ;; @@ -115,4 +115,14 @@ echo "env-var-docs @ git+https://github.com/civiform/civiform.git@${commit_sha}\ initialize_python_env $dependencies_file_path -cloud/shared/bin/run.py --command $command --tag $tag --config $source_config +args=("--command" "${command}" "--tag" "${tag}" "--config" "${source_config}") + +if [[ -n "${FORCE_UNLOCK_ID}" ]]; then + args=("${args[@]}" "--force-unlock" "${FORCE_UNLOCK_ID}") +fi + +if [[ -n "${LOCK_TABLE_DIGEST_VALUE}" ]]; then + args=("${args[@]}" "--lock-table-digest-value" "${LOCK_TABLE_DIGEST_VALUE}") +fi + +cloud/shared/bin/run.py "${args[@]}" diff --git a/cloud/shared/bin/run.py b/cloud/shared/bin/run.py index 63a47917..359c683f 100755 --- a/cloud/shared/bin/run.py +++ b/cloud/shared/bin/run.py @@ -14,6 +14,8 @@ from cloud.shared.bin.lib.print import print from cloud.shared.bin.lib.write_tfvars import TfVarWriter from cloud.shared.bin.lib import backend_setup +from cloud.shared.bin.lib import terraform +from cloud.aws.templates.aws_oidc.bin.aws_cli import AwsCli _CIVIFORM_RELEASE_TAG_REGEX = re.compile(r'^v?[0-9]+\.[0-9]+\.[0-9]+$') @@ -32,6 +34,14 @@ def main(): '--config', default='civiform_config.sh', help='Path to CiviForm deployment config file.') + parser.add_argument( + '--force-unlock', + help='Lock ID to force unlock before performing the Terraform apply.') + parser.add_argument( + '--lock-table-digest-value', + help= + 'Digest value for the Terraform lock table to set in DynamoDB. If multiple processes are doing a deploy, or an error occurred in a previous deploy that prevented Terraform from cleaning up after itself, this value may need updating. Only works on AWS deployments.' + ) args = parser.parse_args() if args.tag: @@ -55,6 +65,18 @@ def main(): # Setup backend backend_setup.setup_backend(config) + # Run the command to force unlock the TF state lock + if args.force_unlock: + print("Force unlocking the Terraform state") + terraform.force_unlock(config, args.force_unlock) + + if args.lock_table_digest_value: + print( + f"Fixing the lock file digest value in DynamoDB, setting it to {args.lock_table_digest_value}" + ) + aws = AwsCli(config) + aws.set_lock_table_digest_value(args.lock_table_digest_value) + # Write the passthrough vars to a temporary file print("Writing TF Vars file") terraform_tfvars_path = os.path.join(