From 2b3d3ea30b6cd32b3b40b0cdc27b9111ef25129f Mon Sep 17 00:00:00 2001
From: searchsolved <72097373+searchsolved@users.noreply.github.com>
Date: Sat, 3 Jul 2021 12:30:48 +0100
Subject: [PATCH] https://twitter.com/LeeFootSEO

---
 Best_Selling_Products_to_XML.ipynb | 250 +++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)
 create mode 100644 Best_Selling_Products_to_XML.ipynb

diff --git a/Best_Selling_Products_to_XML.ipynb b/Best_Selling_Products_to_XML.ipynb
new file mode 100644
index 0000000..9c47aa8
--- /dev/null
+++ b/Best_Selling_Products_to_XML.ipynb
@@ -0,0 +1,250 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Best Selling Products to XML.ipynb",
+      "provenance": [],
+      "collapsed_sections": [],
+      "authorship_tag": "ABX9TyOVYQuIs3u3dIfzdQuBjk//",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "\"Open"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "R28dUAVvCrtg"
+      },
+      "source": [
+        "# Convert Best Selling Products (Transactions or Revenue) to an XML Sitemap\n",
+        "Why? Uploading an XML sitemap lets you retrieve detailed Search Coverage data for your best-performing URLs.\n",
+        "\n",
+        "This means you can spot and resolve indexing issues and make more money (e.g. your products might dominate AdWords while other issues hold them back in organic search).\n",
+        "\n",
+        "## How to use:\n",
+        "\n",
+        "1. Download a Landing Page report (in Excel format) from Google Analytics (Behaviour > Site Content > Landing Pages).\n",
+        "2. Specify the domain name in the cell below.\n",
+        "3. Run all cells from the Runtime menu above and upload the Analytics export when prompted.\n",
+        "\n",
+        "## Options:\n",
+        "\n",
+        "* Specify the domain of your website (required).\n",
+        "* Specify the top X percent of transactions / revenue to keep.\n",
+        "* Choose between Transactions or Revenue.\n",
+        "\n",
+        "## Output\n",
+        "* XML sitemap - ready to upload to Search Console.\n",
+        "\n",
+        "https://twitter.com/LeeFootSEO <--- Follow for more scripts like this."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4GkFqBc2FNT0"
+      },
+      "source": [
+        "# Set All Options Below - Make Sure to Correctly Set Your Domain!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ky2SN0LjFDhd"
+      },
+      "source": [
+        "domain = \"https://www.example.com\"\n",
+        "drop_words = \"checkout|basket|paypal|search|account|(not set)\"\n",
+        "select_on = \"Transactions\"\n",
+        "#select_on = \"Revenue\"\n",
+        "top_percent = 5  # set the top X percent to keep"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "0uSI80-2FRW8"
+      },
+      "source": [
+        "!pip install pandas"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "UB4Z_qoID97h"
+      },
+      "source": [
+        "import urllib.parse as urlparse\n",
+        "from glob import glob\n",
+        "\n",
+        "import pandas as pd\n",
+        "from jinja2 import Template\n",
+        "from google.colab import files"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "tlXKgUOCETRi"
+      },
+      "source": [
+        "# Upload a Landing Page report from GA - must be an Excel file export, CSV is not accepted!\n",
+        "uploaded = files.upload()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "p7kFMe4qFvRh"
+      },
+      "source": [
+        "filename = list(uploaded.keys())[0]  # get the filename from the upload\n",
+        "df = pd.read_excel(filename, 'Dataset1')  # choose the right sheet from the Excel workbook"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "nyh7vzSgF25a"
+      },
+      "source": [
+        "# drop NaNs, force all values to str, prepend the domain and parse out the URLs\n",
+        "df = df[df[\"Landing Page\"].notna()]\n",
+        "df['Landing Page'] = df['Landing Page'].astype(str)\n",
+        "df['Landing Page'] = domain + df['Landing Page']\n",
+        "df[\"protocol\"], df[\"domain\"], df[\"path\"], df[\"query\"], df[\"fragment\"] = zip(*df[\"Landing Page\"].map(urlparse.urlsplit))\n",
+        "df['path'] = df['path'].str.split('&').str[0]  # split / remove anything with an & symbol"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "QRHQ5gq8GEp8"
+      },
+      "source": [
+        "# calculate the top X percent of rows and drop everything below the threshold\n",
+        "count_rows = df.shape[0]\n",
+        "df[select_on] = df[select_on].astype(int)\n",
+        "df = df.sort_values(by=select_on, ascending=False)\n",
+        "keep_rows = top_percent * count_rows / 100\n",
+        "keep_rows = int(keep_rows)\n",
+        "print(\"Creating XML sitemap for the top\", keep_rows, \"rows by\", select_on)\n",
+        "df = df.head(keep_rows)  # keep the top rows after sorting (filtering on the old index would keep the wrong rows)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
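+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "previewTopRows"
+      },
+      "source": [
+        "# Optional sanity check (a minimal sketch, not part of the original workflow):\n",
+        "# preview the rows that survived the top-percent filter before the sitemap is built,\n",
+        "# so you can confirm the expected landing pages are present.\n",
+        "# Assumes the default GA export column header \"Landing Page\" and the select_on value set above.\n",
+        "df[[\"Landing Page\", select_on]].head(10)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },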
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "dwR0qoRRGUiB"
+      },
+      "source": [
+        "# drop URLs that match the drop_words value (useful to exclude checkout, basket URLs etc), then drop duplicates\n",
+        "df = df[~df[\"path\"].isin([\"/\"])]\n",
+        "df = df[~df[\"path\"].str.contains(drop_words, na=False)]\n",
+        "df.drop_duplicates(subset=\"path\", inplace=True)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "olSI8AESGXAh"
+      },
+      "source": [
+        "# get today's date and add it to a new column (used for the <lastmod> value)\n",
+        "df['Date'] = pd.Timestamp('today').strftime('%Y-%m-%d')\n",
+        "df['path'] = domain + df['path']  # prepend the domain before creating the sitemap"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "cJqDzHmRGYfd"
+      },
+      "source": [
+        "# make a simple df with two column values for itertuples\n",
+        "df_sitemap = df[['path', 'Date']]\n",
+        "df_sitemap = df_sitemap.set_index(['path'])"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "hZQaZImyGZf3"
+      },
+      "source": [
+        "# make the sitemap and print a sample of the output\n",
+        "sitemap_template = \"\"\"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
+        "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n",
+        "    {% for page in pages %}\n",
+        "    <url>\n",
+        "        <loc>{{page[0]|safe}}</loc>\n",
+        "        <lastmod>{{page[1]}}</lastmod>\n",
+        "        <priority>0.80</priority>\n",
+        "    </url>\n",
+        "    {% endfor %}\n",
+        "</urlset>\"\"\"\n",
+        "\n",
+        "template = Template(sitemap_template)\n",
+        "xml_sitemap_output = template.render(pages=df_sitemap.itertuples())\n",
+        "\n",
+        "print(xml_sitemap_output[:500])\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "R2cMEPYJGjvv"
+      },
+      "source": [
+        "with open(\"high_value_landing_pages.xml\", \"w\") as fh:\n",
+        "    fh.write(xml_sitemap_output)\n",
+        "files.download(\"high_value_landing_pages.xml\")"
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file