update.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""

    Alternative Internet Update Script
    Rolf Jagerman, Wendo Sabée, Laurens Versluis, Martijn de Vos
    TU Delft

This script reads a bunch of JSON files, finds appropriate metadata on Ohloh and writes everything neatly to a table.
The individual json files that represent the projects should have at least the following structure:

  {
    "name": "Project name",
    "description": "A description, in markdown format"
  }

This script adds additional Ohloh metadata about the project. It is possible that this metadata is incorrect due to
Ohloh's search engine returning the wrong project. You can skip the Ohloh metadata for such a project, using the
following json structure:

  {
    "name": "Project name",
    "description": "A description, in markdown format",
    "ohloh": {
      "skip": true
    }
  }

Alternatively, you can manually look up the project on Ohloh and get the correct Ohloh identifier for this project. You
can tell the script to use that identifier with the following json structure:

  {
    "name": "Project name",
    "description": "A description, in markdown format",
    "ohloh": {
      "id": "1234567"
    }
  }


Usage:

To process the JSON files in a given directory and find metadata on Ohloh, use

    python update.py -a [your-ohloh-api-key] -d [the-directory-to-store-in]

To use the default directory ("projects"), simply omit the -d parameter:

    python update.py -a [your-ohloh-api-key]

If you don't want to get information from Ohloh and just generate the table, omit the -a parameter:

    python update.py

"""

from __future__ import unicode_literals
from argparse import ArgumentParser
from os import listdir
from os.path import isfile, join
from operator import itemgetter
from collections import OrderedDict
import sys
import datetime
import logging
import codecs
import json

try:
    from urllib.request import urlopen
    from urllib.parse import urlencode
except ImportError:
    from urllib2 import urlopen
    from urllib import urlencode
try:
    from elementtree import ElementTree
except ImportError:
    from xml.etree import ElementTree


# Ohloh API key; stays None when no key is given, in which case no Ohloh lookups are made (set by main from -a)
api_key = None
# Directory that holds one JSON file per project (set by main from -d)
json_directory = 'projects'
# Base name of the generated markdown files, without the .md extension (set by main from -f)
table_file = 'README'
# Markdown written before the table in every generated file
text_header = (
    "# Alternative Internet\n"
    "[Pull requests VERY welcome!](CONTRIBUTING.md)\n\n"
    "Project statistics fetched from [Ohloh](https://www.ohloh.net).\n\n"
)
# Markdown written after the table in every generated file
text_footer = ""


class SortableMarkdownTable(object):
    """
    Generates markdown files that represent a sortable table

    One markdown file is written per sortable column. Every file contains the same table, sorted on that column,
    and the header cells of the other sortable columns link to the files that are sorted on them.
    """

    def __init__(self):
        self.columns = []
        self.rows = []

    def add_column(self, title, sortable=False, suffix='', align='', width=3, reverse=False):
        """
        Adds a column to the table

        Keyword arguments:
        title -- The column's visible title
        sortable -- Whether this column is sortable (and should therefore generate a file)
        suffix -- The file suffix to write to (stored with a leading underscore when not empty)
        align -- The column alignment: 'L', 'C', 'R' or '' for no explicit alignment
        width -- The column width (number of dashes in the separator row)
        reverse -- Whether the sorting should happen in reverse (useful for numbers)
        """
        if suffix != '':
            suffix = '_' + suffix
        self.columns.append({'title': title, 'sortable': sortable, 'reverse': reverse, 'suffix': suffix, 'align': align,
                             'width': width})

    def add_row(self, data):
        """
        Adds a row to the table

        Keyword arguments:
        data -- A list of data, the order of the data determines the column it ends up in

        Raises ValueError when the number of cells differs from the number of columns.
        """
        # An explicit check instead of ``assert``: asserts are removed when Python runs with -O
        if len(data) != len(self.columns):
            raise ValueError('Row has %d cell(s) but the table has %d column(s)' % (len(data), len(self.columns)))
        self.rows.append(data)

    def write_files(self, filename='README'):
        """
        Writes the table to markdown files, one file for every sortable column

        Keyword arguments:
        filename -- The file to write to (without the .md extension)
        """
        for idx, column in enumerate(self.columns):
            if column['sortable']:
                self.write_file(filename, idx, column['reverse'])

    def write_file(self, filename, column_idx, reverse):
        """
        Writes a single file for a column index

        The file name is the given filename, followed by the suffix of the column and the .md extension.

        Keyword arguments:
        filename -- The file to write to
        column_idx -- The index of the column to sort on
        reverse -- Whether the sorting should happen in reverse
        """
        # Compare the numeric major version; comparing sys.version as a string is lexicographic.
        # ``unicode`` is only looked up on Python 2, where it exists.
        to_text = unicode if sys.version_info[0] < 3 else str

        def sort_key(row):
            # Empty cells sort before everything else. Python 2 did this implicitly, Python 3 refuses to order
            # None against other types, so the None-ness is compared first and None is never compared to a value.
            cell = row[column_idx]
            return (cell is not None, cell)

        # Sort before opening the file so that a failing sort does not leave a truncated file behind
        data = sorted(self.rows, key=sort_key, reverse=reverse)

        with codecs.open('%s%s.md' % (filename, self.columns[column_idx]['suffix']), 'w', 'utf8') as f:

            # Write readme main text header
            f.write(text_header)

            # Write table header columns; other sortable columns link to the file sorted on that column
            for idx, column in enumerate(self.columns):
                if column['sortable'] and idx != column_idx:
                    f.write('| [%s](%s) ' % (column['title'], '%s%s.md' % (filename, column['suffix'])))
                else:
                    f.write('| %s ' % column['title'])
            f.write('|\n')

            # Write spacing and alignment instructions for table
            for column in self.columns:
                f.write('| ')
                if column['align'] in ('L', 'C'):
                    f.write(':')
                f.write('-' * column['width'])
                if column['align'] in ('R', 'C'):
                    f.write(':')
                f.write(' ')
            f.write('|\n')

            # Write table data, empty cells are shown as a dash
            for row in data:
                for col in row:
                    if col is None:
                        col = '-'
                    f.write(u'|' + to_text(col))
                f.write('|\n')

            # Write footer text
            f.write(text_footer)


class _OhlohValue(object):
    """
    Abstracts complex data and makes it sortable and writable for the SortableMarkdownTable

    The wrapped value is project['ohloh'][value], or None when the project has no such Ohloh field.
    """
    def __init__(self, obj, value):
        ohloh = obj.get('ohloh') or {}
        self.value = ohloh.get(value)

    def __lt__(self, other):
        # Missing values sort before present ones; two missing values are equal, so neither is "less"
        if self.value is None:
            return other.value is not None
        if other.value is None:
            return False
        return self.value < other.value


class _OhlohNumber(_OhlohValue):
    """
    OhlohValue implementation for numeric data with a pretty string form for numbers that are very large
    """
    def __str__(self):
        try:
            value = float(self.value)
            size = ''
            # Divide by 1000 for as long as possible, moving up one unit prefix each time
            for prefix in ('K', 'M', 'G'):
                if value / 1000 < 1.0:
                    break
                value = value / 1000
                size = prefix
            return '%d %s' % (round(value), size)
        except (TypeError, ValueError, OverflowError):
            # Missing or non-numeric value
            return '-'


class _OhlohDate(_OhlohValue):
    """
    OhlohValue implementation for dates with a pretty string form that shows the time difference with today
    """
    def __str__(self):
        td = datetime.datetime.now() - self.dateobj()
        if td.days < 0:
            # Dates in the future, including the datetime.max placeholder used for unparsable values
            return '-'
        if td.days < 30:  # ~one month
            return '<1 month'
        if td.days < 365:  # ~one year
            return '%s month(s)' % (td.days // 30)
        return '%s year(s)' % (td.days // 365)

    def dateobj(self):
        """
        Returns the value as a datetime, or datetime.max when it is missing or cannot be parsed

        The slices assume the value starts with YYYY-MM-DD (presumably an ISO 8601 timestamp such as
        2009-11-01T00:00:00Z -- confirm against the Ohloh API): year in [0:4], month in [5:7], day in [8:10].
        """
        try:
            return datetime.datetime(int(self.value[:4]), int(self.value[5:7]), int(self.value[8:10]))
        except (TypeError, ValueError):
            return datetime.datetime.max

    def __lt__(self, other):
        return self.dateobj() < other.dateobj()


def write_to_table(projects):
    """
    Writes the list of projects to several table files

    Builds one table row per project and lets SortableMarkdownTable write a markdown file for every sortable
    column, using the module level table_file as base file name.

    Keyword arguments:
    projects -- The projects to store, a dictionary whose values are the project dictionaries
    """
    table = SortableMarkdownTable()

    table.add_column('Name', sortable=True, width=4)
    table.add_column('Description', width=11)
    table.add_column('Main Language', width=11, sortable=True, suffix='LANG', reverse=True)
    table.add_column('Commits', sortable=True, width=6, align='R', suffix='COMMITS', reverse=True)
    table.add_column('LOC', sortable=True, width=2, align='R', suffix='LOC', reverse=True)
    table.add_column('Total Contributors', sortable=True, width=2, align='R', suffix='CONTRIB', reverse=True)
    table.add_column('Age', sortable=True, width=2, align='R', suffix='AGE')

    for project in projects.values():
        table.add_row([project['name'],
                       project['description'],
                       # The language is stored as its raw value (possibly None), not as a wrapper object
                       _OhlohValue(project, 'main_language').value,
                       _OhlohNumber(project, 'total_commit_count'),
                       _OhlohNumber(project, 'total_code_lines'),
                       _OhlohNumber(project, 'total_contributor_count'),
                       _OhlohDate(project, 'min_month')
                       ])

    table.write_files(table_file)


def get_projects():
    """
    Gets the projects from the module level json_directory, where each .json file is considered a project

    Returns a dictionary that maps the path of every json file to its parsed content. Objects are parsed into
    OrderedDicts so that the key order of the file is kept.
    """
    projects = {}
    # Sorted, so the files are loaded (and reported) in a predictable order
    for file_name in sorted(listdir(json_directory)):
        file_path = join(json_directory, file_name)
        if isfile(file_path) and file_path.endswith('.json'):
            # Single-argument call form: prints the same line on Python 2 and Python 3
            print('Loading %s' % file_path)
            # Read as utf8 (the encoding save_project writes) and close the file when done
            with codecs.open(file_path, 'r', 'utf8') as json_file:
                projects[file_path] = json.load(json_file, object_pairs_hook=OrderedDict)
    return projects


def save_project(project, file_path):
    """
    Saves given project as a json file (utf8, indented with four spaces)

    Keyword arguments:
    project -- The project to store
    file_path -- The path of the file to store the project in, an existing file is overwritten
    """
    # The with block closes the file, so the content is flushed to disk before this function returns
    with codecs.open(file_path, 'w', 'utf8') as json_file:
        json.dump(project, json_file, indent=4)


def get_ohloh_api_request(url, api_key, params=None):
    """
    Sends an API request to Ohloh and returns the resulting xml tree or raises an exception if an error occurred.

    Keyword arguments:
    url -- The request url to get
    api_key -- The Ohloh API key to use
    params -- Additional parameters to send (a dictionary), they are added to the query string

    Raises an Exception with the content of the error element when the response contains one directly below the
    root element.
    """
    parameters = {'api_key': api_key}
    if params is not None:
        parameters.update(params)

    response = urlopen('%s?%s' % (url, urlencode(parameters)))
    try:
        tree = ElementTree.parse(response)
    finally:
        # Always release the connection, also when the response is not valid xml
        response.close()

    error = tree.getroot().find("error")
    if error is not None:
        # Prefer the readable text of the error; fall back to the serialized element when it has no text
        message = ''.join(error.itertext()).strip()
        raise Exception(message or ElementTree.tostring(error))
    return tree


def search_ohloh_project(project_name):
    """
    Looks up a project on Ohloh by its name and returns the metadata of the first search result

    The search is sorted on relevance. The returned dictionary has the keys id, name, description, analysis and
    tags (a list with the text of every tag).

    Keyword arguments:
    project_name -- The project name to search for

    Raises an Exception when the search result contains no project with an id.
    """
    query = {'query': project_name, 'sort': 'relevance'}
    response = get_ohloh_api_request('https://www.ohloh.net/p.xml', api_key, query)

    if response.find('result/project/id') is None:
        raise Exception("Could not find project %s on Ohloh" % project_name)

    match = response.find('result/project')

    # Copy the plain text fields first (dictionary key, xml element name), the tags are added last
    metadata = {}
    for key, element_name in (('id', 'id'),
                              ('name', 'name'),
                              ('description', 'description'),
                              ('analysis', 'analysis_id')):
        metadata[key] = match.findtext(element_name)

    tags = []
    for tag in match.iterfind('tags/tag'):
        tags.append(tag.text)
    metadata['tags'] = tags

    return metadata


def add_ohloh_metadata(project):
    """
    Attempts to find given project on Ohloh and adds metadata about the project

    The metadata is stored in project['ohloh'], the project dictionary is modified in place:
    - when project['ohloh']['skip'] is true, the entry is reset to {'skip': True} and nothing is looked up
    - when there is no Ohloh id yet, the project is searched by name and the entry is replaced by the result
    - missing general project fields are fetched using the id
    - the figures of the latest code analysis are always fetched and stored

    Keyword arguments:
    project -- The project dictionary to add the Ohloh metadata to, needs at least a 'name'

    Raises an Exception when the project or its code analysis cannot be found on Ohloh.
    """
    ohloh = project.get('ohloh')

    # Matches the JSON value true (and 1), deliberately not every truthy value such as the string "false"
    if ohloh is not None and ohloh.get('skip') == True:  # noqa: E712
        project['ohloh'] = {'skip': True}
        return

    if ohloh is None or 'id' not in ohloh:
        ohloh = project['ohloh'] = search_ohloh_project(project['name'])

    project_id = ohloh['id']

    if any(key not in ohloh for key in ('name', 'description', 'analysis', 'tags')):
        # Plain %-formatting converts the id to text on both Python 2 and 3, unicode() only exists on Python 2
        results = get_ohloh_api_request('https://www.ohloh.net/p/%s.xml' % project_id, api_key)
        result = results.find('result/project')
        if result is None:
            raise Exception("Could not find project id %s on Ohloh" % project_id)
        ohloh.update({
            'id': result.findtext('id'),
            'name': result.findtext('name'),
            'description': result.findtext('description'),
            'analysis': result.findtext('analysis_id'),
            'tags': [tag.text for tag in result.iterfind('tags/tag')]
        })

    results = get_ohloh_api_request('https://www.ohloh.net/p/%s/analyses/latest.xml' % project_id, api_key)
    analysis = results.find("result/analysis")
    if analysis is None:
        raise Exception("Could not get Ohloh code analysis for project id %s" % project_id)

    ohloh.update({'total_code_lines': int(analysis.findtext('total_code_lines')),
                  'total_commit_count': int(analysis.findtext('total_commit_count')),
                  'total_contributor_count': int(analysis.findtext('total_contributor_count')),
                  'twelve_month_commit_count': int(analysis.findtext('twelve_month_commit_count')),
                  'twelve_month_contributor_count': int(analysis.findtext('twelve_month_contributor_count')),
                  'updated_at': analysis.findtext('updated_at'),
                  'min_month': analysis.findtext('min_month'),
                  'max_month': analysis.findtext('max_month'),
                  # An empty factoid element has no text (None), treat it as an empty string
                  'factoids': [(f.text or '').strip() for f in analysis.iterfind('factoids/factoid')],
                  'main_language': analysis.findtext('main_language_name')})


def run_crawler():
    """
    Loads the projects from the json directory, adds Ohloh metadata to them and writes the markdown tables.

    Takes no arguments, the configuration is read from the module level api_key, json_directory and table_file.
    The Ohloh lookup is skipped when no api key is set. Every project is written back to its json file, also when
    its Ohloh lookup failed.
    """
    projects = get_projects()

    for file_path, project in projects.items():

        logging.info("Processing %s", file_path)

        if api_key is not None:
            try:
                add_ohloh_metadata(project)
            except Exception as error:
                # Best effort: a failed lookup must not stop the run, but the reason is reported.
                # KeyboardInterrupt and SystemExit are not caught, so the script can still be aborted.
                logging.warning("Skipping Ohloh metadata for project %s: %s", project.get('name', file_path), error)

        save_project(project, file_path)

    logging.info("Writing to tables")
    write_to_table(projects)
    logging.info("Done!")


def main():
    """
    Main entry point of the application, execution starts here

    Parses the command line, stores the options in the module level settings and starts the crawler.
    """
    global api_key, json_directory, table_file

    logging.getLogger().setLevel(logging.INFO)

    parser = ArgumentParser(
        description='Crawls the projects on the alternative internet github and adds additional data from Ohloh.')

    # Every option is an optional, value-storing argument; only the flags and these settings differ
    options = (
        (('-a', '--api'),
         dict(dest='api', metavar="APIKEY", default=None, help='Your Ohloh API key.')),
        (('-d', '--directory'),
         dict(dest='directory', metavar="projects", default='projects',
              help='Directory where the JSON files are located.')),
        (('-f', '--file'),
         dict(dest='filename', metavar="README", default='README',
              help='File to write the output table to (without the .md extension).')),
    )
    for flags, settings in options:
        parser.add_argument(*flags, action='store', required=False, **settings)

    args = parser.parse_args()
    api_key, json_directory, table_file = args.api, args.directory, args.filename

    run_crawler()


# Only run when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()