Commit 2b3d3ea (1 parent: 49e09f5) showing 1 changed file with 250 additions and 0 deletions.
@@ -0,0 +1,250 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"name": "Best Selling Products to XML.ipynb", | ||
"provenance": [], | ||
"collapsed_sections": [], | ||
"authorship_tag": "ABX9TyOVYQuIs3u3dIfzdQuBjk//", | ||
"include_colab_link": true | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "view-in-github", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"<a href=\"https://colab.research.google.com/github/searchsolved/Twittorials/blob/master/Best_Selling_Products_to_XML.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "R28dUAVvCrtg" | ||
}, | ||
"source": [ | ||
"# Convert Best Selling Products (Transactions or Revenue) to An XML Sitemap\n", | ||
"Why? Uploading an XML sitemap allows you to retrieve detailed Search Coverage data for your best performing URLs).\n", | ||
"\n", | ||
"This means you can spot and resolve any indexing issues and make more money. (e.g. it could be that your products dominate adwords, but other issues are holding it back from organic). \n", | ||
"\n", | ||
"## How to use:\n", | ||
"\n", | ||
"1. Download a Landing Page Report (In Excel Format) from Google Analytics (Behaviour > Site Content > Landing Pages).\n", | ||
"2. Specify the domain name in the cell below\n", | ||
"3. Run all cells from the Runtime menu above and upload the Analytics export when prompted.\n", | ||
"\n", | ||
"## Options:\n", | ||
"\n", | ||
"* Specify the domain of your Website. (Required)\n", | ||
"* Specify Top X Percent of Transactions / Revenue\n", | ||
"* Choice of Transactions of Revenue\n", | ||
"\n", | ||
"## Output\n", | ||
"* XML Sitemap - ready to upload to Search Console.\n", | ||
"\n", | ||
"https://twitter.com/LeeFootSEO <--- Follow for more scripts like this." | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "4GkFqBc2FNT0" | ||
}, | ||
"source": [ | ||
"# Set All Options Below - Make Sure to Correctly Set Your Domain!" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "ky2SN0LjFDhd" | ||
}, | ||
"source": [ | ||
"domain = \"https://www.example.com\"\n", | ||
"drop_words = \"checkout|basket|paypal|search|account|(not set)\"\n", | ||
"select_on = \"Transactions\"\n", | ||
"#select_on = \"Revenue\"\n", | ||
"top_percent = 5 # set the top X percent to keep" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
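{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Optional illustration (an added sketch, not part of the original workflow): `drop_words` is used further down as a regular expression with `str.contains`, so any landing-page path containing one of these substrings is removed. The sample paths below are made up purely to show the effect." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": {}, | ||
"source": [ | ||
"# Illustration only: the sample paths are hypothetical. Paths matching the\n", | ||
"# drop_words pattern are the ones the later filtering step would remove.\n", | ||
"import pandas as pd\n", | ||
"sample_paths = pd.Series([\"/checkout/step-1\", \"/category/shoes\", \"/search?q=boots\"])\n", | ||
"print(sample_paths[~sample_paths.str.contains(drop_words, na=False)])" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||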
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "0uSI80-2FRW8" | ||
}, | ||
"source": [ | ||
"!pip install pandas" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "UB4Z_qoID97h" | ||
}, | ||
"source": [ | ||
"import urllib.parse as urlparse\n", | ||
"from glob import glob\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"from jinja2 import Template\n", | ||
"from google.colab import files" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "tlXKgUOCETRi" | ||
}, | ||
"source": [ | ||
"# Upload a Landing Page Report from GA - Must be an Excel File Export - csv not accepted!\n", | ||
"uploaded = files.upload()" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "p7kFMe4qFvRh" | ||
}, | ||
"source": [ | ||
"filename = list(uploaded.keys())[0] # get the filename from the upload\n", | ||
"df = pd.read_excel(filename, 'Dataset1') # choose the right sheet from Excel" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
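{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Optional sanity check (an added sketch): the remaining cells assume the export has a `Landing Page` column plus the metric chosen in `select_on`, on a sheet named `Dataset1`. Printing the shape and column names makes a mismatched export obvious before it causes errors further down." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": {}, | ||
"source": [ | ||
"# Quick look at what was loaded: row count and column names. The rest of the\n", | ||
"# notebook expects a \"Landing Page\" column and the metric named in select_on.\n", | ||
"print(df.shape)\n", | ||
"print(df.columns.tolist())\n", | ||
"df.head()" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||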
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "nyh7vzSgF25a" | ||
}, | ||
"source": [ | ||
"# drop NaNs, force all values to str, append the domain and parse out the urls\n", | ||
"df = df[df[\"Landing Page\"].notna()]\n", | ||
"df['Landing Page'] = df['Landing Page'].astype(str)\n", | ||
"df['Landing Page'] = domain + (df['Landing Page'])\n", | ||
"df[\"protocol\"], df[\"domain\"], df[\"path\"], df[\"query\"], df[\"fragment\"] = zip(*df[\"Landing Page\"].map(urlparse.urlsplit))\n", | ||
"df['path'] = df['path'].str.split('&').str[0] # Split / remove anything with an & symbol" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
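{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"For reference, a minimal example (using a made-up URL) of what `urlsplit` returns; its five parts are what get zipped into the `protocol`, `domain`, `path`, `query` and `fragment` columns above." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": {}, | ||
"source": [ | ||
"# Illustration only, with a hypothetical URL: urlsplit returns a 5-part tuple\n", | ||
"# (scheme, netloc, path, query, fragment).\n", | ||
"import urllib.parse as urlparse\n", | ||
"parts = urlparse.urlsplit(\"https://www.example.com/product-1?utm_source=email#reviews\")\n", | ||
"print(parts)" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||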
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "QRHQ5gq8GEp8" | ||
}, | ||
"source": [ | ||
"# calculate top x percentage of rows and drop anything below threshold\n", | ||
"count_rows = df.shape[0]\n", | ||
"df[select_on] = df[select_on].astype(int)\n", | ||
"df = df.sort_values(by=select_on, ascending=False)\n", | ||
"drop_rows = top_percent * count_rows / 100\n", | ||
"drop_rows = int(drop_rows)\n", | ||
"print(\"Creating XML Sitemap for top\", drop_rows, \"Rows by\", select_on)\n", | ||
"df = df[df.index < drop_rows]" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
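{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Design note: sorting and then taking the head can also be written as a single `nlargest()` call. The sketch below is equivalent and shown only for reference; `top_by_metric` is an illustrative name and is not used elsewhere in the notebook." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": {}, | ||
"source": [ | ||
"# Equivalent alternative to sort_values() + head(): nlargest() picks the same\n", | ||
"# top rows by the chosen metric in one step.\n", | ||
"top_by_metric = df.nlargest(drop_rows, select_on)\n", | ||
"print(top_by_metric.shape)" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||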
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "dwR0qoRRGUiB" | ||
}, | ||
"source": [ | ||
"# drop urls that match the drop_words value. (Useful to block checkout, baskets URLs etc) - then drop duplicates\n", | ||
"df = df[~df[\"path\"].isin([\"/\"])]\n", | ||
"df = df[~df[\"path\"].str.contains(drop_words, na=False)]\n", | ||
"df.drop_duplicates(subset=\"path\", inplace=True)" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "olSI8AESGXAh" | ||
}, | ||
"source": [ | ||
"# get today's date and add to new column (used for <lastmod>)\n", | ||
"df['Date'] = pd.date_range('today', periods=len(df), freq='D').normalize()\n", | ||
"df['path'] = domain + df['path'] # append the domain in before creating the sitemap" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "cJqDzHmRGYfd" | ||
}, | ||
"source": [ | ||
"# make a simple df with two column values for itertuples\n", | ||
"df_sitemap = df[['path', 'Date']]\n", | ||
"df_sitemap = df_sitemap.set_index(['path'])" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "hZQaZImyGZf3" | ||
}, | ||
"source": [ | ||
"# make the sitemap and print a sample of the output\n", | ||
"sitemap_template = \"\"\"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n", | ||
"<urlset xmlns = \"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">\n", | ||
" {% for page in pages %}\n", | ||
" <url>\n", | ||
" <loc>{{page[0]|safe}}</loc>\n", | ||
" <lastmod>{{page[1]}}</lastmod>\n", | ||
" <priority>0.80</priority>\n", | ||
" \n", | ||
" </url>\n", | ||
" {% endfor %}\n", | ||
"</urlset>\"\"\"\n", | ||
"\n", | ||
"template = Template(sitemap_template)\n", | ||
"xml_sitemap_output = template.render(pages=df_sitemap.itertuples())\n", | ||
"\n", | ||
"print(xml_sitemap_output[:500])\n" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
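{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Optional check before downloading (an added sketch, not part of the original flow): parse the rendered string with the standard library to confirm it is well-formed XML and count the `<url>` entries." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": {}, | ||
"source": [ | ||
"# Parse the generated sitemap to confirm it is well-formed and count its URLs.\n", | ||
"import xml.etree.ElementTree as ET\n", | ||
"root = ET.fromstring(xml_sitemap_output.encode(\"utf-8\"))  # encode first: the string carries an encoding declaration\n", | ||
"ns = \"{http://www.sitemaps.org/schemas/sitemap/0.9}\"\n", | ||
"print(len(root.findall(ns + \"url\")), \"URL entries in the sitemap\")" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||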
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "R2cMEPYJGjvv" | ||
}, | ||
"source": [ | ||
"with open(\"high_value_landing_pages.xml\", \"w\") as fh:\n", | ||
" fh.write(xml_sitemap_output)\n", | ||
"files.download(\"high_value_landing_pages.xml\")" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
} | ||
] | ||
} |