From 8719beab0c0f578ac441e355b26c7315fa28cb2c Mon Sep 17 00:00:00 2001
From: Seif Bassem <38246040+sebassem@users.noreply.github.com>
Date: Mon, 14 Oct 2024 18:07:28 +0300
Subject: [PATCH] feat: Add script to parse WAF security recommendations and
 generate CSV file

This commit adds a new Python script,
`Parse-WAF-Security-Recommendations.py`, which parses the Table of
Contents (TOC) YAML file and extracts security recommendations for
Azure Web Application Firewall (WAF). The script uses the requests,
yaml, csv, markdown, and BeautifulSoup libraries to fetch the TOC,
convert Markdown to HTML, and extract the relevant information from the
recommendation pages.

The script starts by fetching the TOC YAML file from the Azure Security
Docs repository. It then recursively searches the TOC for
recommendation items and retrieves their details from the corresponding
recommendation pages. For each recommendation, the script extracts the
name, description, policy URL, severity, and type, and writes them to a
CSV file named `recommendations.csv`.

This script automates the extraction of WAF security recommendations
and generates a CSV file for further analysis or reporting.
---
 .../Parse-WAF-Security-Recommendations.py | 157 ++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 utilities/tools/platform/Parse-WAF-Security-Recommendations.py

diff --git a/utilities/tools/platform/Parse-WAF-Security-Recommendations.py b/utilities/tools/platform/Parse-WAF-Security-Recommendations.py
new file mode 100644
index 000000000..2885d05aa
--- /dev/null
+++ b/utilities/tools/platform/Parse-WAF-Security-Recommendations.py
@@ -0,0 +1,157 @@
+import requests
+import yaml
+import csv
+import markdown
+from bs4 import BeautifulSoup
+import re
+import time
+
+def parse_toc(url):
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        content = response.text
+        toc = yaml.safe_load(content)
+
+        def find_recommendations(items):
+            results = []
+            for item in items:
+                if 'items' in item:
+                    results.extend(find_recommendations(item['items']))
+                elif 'href' in item and item['href'].startswith('recommendations-reference'):
+                    results.append(item)
+            return results
+
+        recommendations = find_recommendations(toc)
+        print(f"Found {len(recommendations)} recommendation items in TOC")
+        return recommendations
+    except Exception as e:
+        print(f"Error parsing TOC: {str(e)}")
+        return []
+
+def parse_recommendation_page(url):
+    try:
+        print(f"Fetching page: {url}")
+        response = requests.get(url)
+        response.raise_for_status()
+        content = response.text
+
+        # Convert Markdown to HTML
+        html_content = markdown.markdown(content)
+
+        soup = BeautifulSoup(html_content, 'html.parser')
+
+        # Find all h2 tags that contain "Azure"
+        azure_h2_tags = [h2 for h2 in soup.find_all('h2') if 'Azure' in h2.text]
+
+        if not azure_h2_tags:
+            print("No h2 tag with 'Azure' found")
+            return None
+
+        recommendations = []
+
+        for h2 in azure_h2_tags:
+            name = h2.text.strip()
+            print(f"Found name: {name}")
+
+            # Find all h3 tags that follow this h2 until the next h2
+            h3_tags = []
+            next_element = h2.next_sibling
+            while next_element and next_element.name != 'h2':
+                if next_element.name == 'h3':
+                    h3_tags.append(next_element)
+                next_element = next_element.next_sibling
+
+            for h3 in h3_tags:
+                description = h3.text.strip()
+                print(f"Found description: {description}")
+
+                # Find the next sibling elements until the next h3 or h2
+                next_element = h3.next_sibling
+                policy_url = ''
+                severity = ''
+                type_info = ''
+                additional_info = []
+
+                while next_element and next_element.name not in ['h3', 'h2']:
+                    if isinstance(next_element, str):
+                        text = next_element.strip()
+                        if text:
+                            additional_info.append(text)
+                    elif next_element.name in ['p', 'li']:
+                        text = next_element.text.strip()
+                        if text:
+                            additional_info.append(text)
+
+                    next_element = next_element.next_sibling
+
+                # Join additional info and search for severity, type, and policy URL
+                additional_text = ' '.join(additional_info)
+
+                severity_match = re.search(r'Severity:\s*(\w+)', additional_text)
+                if severity_match:
+                    severity = severity_match.group(1)
+                    print(f"Found severity: {severity}")
+
+                type_match = re.search(r'Type:\s*(.+?)(?=\n|$)', additional_text)
+                if type_match:
+                    type_info = type_match.group(1).strip()
+                    print(f"Found type: {type_info}")
+
+                # Extract policy URL
+                policy_match = re.search(r'Related policy: \[.+?\]\((https://portal\.azure\.com/#blade/Microsoft_Azure_Policy/PolicyDetailBlade/definitionId/[^)]+)\)', additional_text)
+                if policy_match:
+                    policy_url = policy_match.group(1)
+                    print(f"Found policy URL: {policy_url}")
+
+                recommendations.append({
+                    'name': name,
+                    'description': description,
+                    'policy_url': policy_url,
+                    'severity': severity,
+                    'type': type_info
+                })
+
+        return recommendations
+    except Exception as e:
+        print(f"Error parsing page {url}: {str(e)}")
+        return None
+
+# URL of the TOC.yml file
+toc_url = "https://raw.githubusercontent.com/MicrosoftDocs/azure-security-docs/refs/heads/main/articles/defender-for-cloud/TOC.yml"
+
+# Parse the TOC and get recommendations
+recommendation_items = parse_toc(toc_url)
+
+if not recommendation_items:
+    print("No recommendation items found. Exiting.")
+    exit()
+
+# Prepare CSV file
+csv_filename = 'recommendations.csv'
+csv_headers = ['Name', 'Description', 'Policy URL', 'Severity', 'Type']
+
+with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
+    writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
+    writer.writeheader()
+
+    for item in recommendation_items:
+        page_url = f"https://raw.githubusercontent.com/MicrosoftDocs/azure-security-docs/refs/heads/main/articles/defender-for-cloud/{item['href']}"
+        page_data = parse_recommendation_page(page_url)
+
+        if page_data:
+            for recommendation in page_data:
+                writer.writerow({
+                    'Name': recommendation['name'],
+                    'Description': recommendation['description'],
+                    'Policy URL': recommendation['policy_url'],
+                    'Severity': recommendation['severity'],
+                    'Type': recommendation['type']
+                })
+                print(f"Processed and wrote to CSV: {recommendation['description']}")
+        else:
+            print(f"Failed to process: {item['name']}")
+
+        time.sleep(1)
+
+print(f"CSV file '{csv_filename}' has been created with the extracted information.")
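
Usage note: below is a minimal sketch of how the generated `recommendations.csv` could be consumed for the further analysis or reporting mentioned above. It assumes only the column headers written by the script (Name, Description, Policy URL, Severity, Type); the snippet is illustrative and not part of the committed script.

import csv
from collections import Counter

# Tally recommendations per severity from the CSV produced by the script.
with open('recommendations.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    severity_counts = Counter(row['Severity'] for row in reader)

for severity, count in severity_counts.most_common():
    print(f"{severity or 'Unknown'}: {count}")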