Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] New version with cleaner options #162

Merged
merged 29 commits into from
Mar 20, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
a5b1f9a
WIP - New version with cleaner options
pjbull Mar 22, 2019
9d3cb0f
Fix find-replace error (#177)
jamesmyatt Aug 5, 2019
af9fb20
Remove unnecessary .gitkeep
pjbull Feb 10, 2020
142dbe0
Remove unused tox.ini
pjbull Feb 10, 2020
e255c1f
Split reqs into dev/non-dev
pjbull Feb 13, 2020
9ebe237
Add basic packages support
pjbull Feb 13, 2020
481812a
Add tests for testing environment creation and requirements
pjbull Feb 13, 2020
10639e7
Set up CI with Azure Pipelines (#194)
drivendata Jul 14, 2020
28d07cb
Merge branch 'master' into new-cli
pjbull Jul 14, 2020
0c794d3
More graceful deprecation
pjbull Nov 6, 2020
fc41bec
Make tests pass locally
pjbull Nov 6, 2020
ed0340c
test version match installed version
pjbull Nov 7, 2020
f489372
Remove unused imports
pjbull Nov 7, 2020
ddc11ad
Unremove used import
pjbull Nov 7, 2020
d1bf5c0
Move to GH Actions
pjbull Nov 8, 2020
782e42e
Fix typo
pjbull Nov 8, 2020
4f5516b
Test non-windows
pjbull Nov 11, 2020
3b371aa
Add netlify configs
r-b-g-b Dec 4, 2020
5f0ef35
Update suggestion to keep using deprecated cookiecutter template (#231)
r-b-g-b Dec 4, 2020
f5ddcaa
Add mkdocs requirements file to docs directory
r-b-g-b Dec 4, 2020
39c4bff
Try setting python version in runtime txt for netlify
r-b-g-b Dec 4, 2020
a648308
Trigger build
r-b-g-b Dec 4, 2020
f273c5f
Python 3.8 netlify
r-b-g-b Dec 4, 2020
50d7634
Python 3.6 netlify
r-b-g-b Dec 4, 2020
898d7d3
Do not specify python runtime for netlify
r-b-g-b Dec 4, 2020
7fdf857
Use 3.7
r-b-g-b Dec 4, 2020
8154bb8
Merge pull request #1 from r-b-g-b/netlify-docs
r-b-g-b Dec 4, 2020
4c82643
Merge pull request #232 from r-b-g-b/new-cli-netlify-docs
drivendata Dec 4, 2020
f1167e9
Merge branch 'v2' into new-cli
pjbull Mar 20, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,9 @@ docs/site/
# test cache
.cache/*
tests/__pycache__/*
*.pytest_cache/
*.pytest_cache/
*.pyc

# other local dev info
.vscode/
cookiecutter_data_science.egg-info/
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ The directory structure of your new project looks like this:
├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g.
│ generated with `pip freeze > requirements.txt`
├── src <- Source code for use in this project.
│ ├── __init__.py <- Makes src a Python module
├── {{ cookiecutter.module_name }} <- Source code for use in this project.
│ ├── __init__.py <- Makes {{ cookiecutter.module_name }} a Python module
│ │
│ ├── data <- Scripts to download or generate data
│ │ └── make_dataset.py
Expand Down
31 changes: 31 additions & 0 deletions ccds/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import json
from pathlib import Path
import re
import sys

# 2/3 compat
# On Python 2 this rebinds `input` to `raw_input`; on Python 3 the name
# `raw_input` does not exist, the lookup raises NameError, and the builtin
# `input` is left as it is.
try:
    input = raw_input
except NameError:
    pass

import click

# Monkey-patch jinja to allow variables to not exist, which happens with sub-options
# NOTE(review): presumably this has to run before any cookiecutter module is
# imported so that those modules pick up the replacement -- keep the order.
import jinja2
jinja2.StrictUndefined = jinja2.Undefined


# Monkey-patch cookiecutter to allow sub-items
# The stock `prompt_for_config` on `cookiecutter.prompt` is replaced by the
# version from `ccds.monkey_patch`.
import cookiecutter
from cookiecutter import prompt
from ccds.monkey_patch import prompt_for_config

prompt.prompt_for_config = prompt_for_config

# Imported only after the patch above has been applied.
# NOTE(review): presumably so that `cli` resolves the patched
# `prompt_for_config` -- confirm before reordering these imports.
from cookiecutter import cli
main = cli.main


if __name__ == "__main__":
    main()
156 changes: 156 additions & 0 deletions ccds/monkey_patch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from collections import OrderedDict
import json

import click
from past.builtins import basestring

from future.utils import iteritems

from jinja2.exceptions import UndefinedError

from cookiecutter.exceptions import UndefinedVariableInTemplate
from cookiecutter.environment import StrictEnvironment


from cookiecutter.prompt import (prompt_choice_for_config, render_variable, read_user_variable, read_user_choice)

def _prompt_choice_and_subitems(cookiecutter_dict, env, key, options, no_input):
    """Pick one of several named options, then collect that option's sub-values.

    ``options`` is a list of single-entry dicts of the form
    ``{option_name: {subkey: default, ...}}``. The option names are rendered
    and offered as a choice; the sub-values of the chosen option are then
    rendered and, unless ``no_input`` is set, confirmed with the user.

    :param cookiecutter_dict: values collected so far, used when rendering.
    :param env: Jinja environment handed to ``render_variable``.
    :param key: name of the variable being configured (the prompt label).
    :param options: list of single-entry ``{name: {subkey: default}}`` dicts.
    :param no_input: if true, take the first option and the rendered
        defaults without prompting.
    :returns: ``{selected_name: {subkey: value, ...}}``.
    """
    # first, get the selection
    rendered_options = [
        render_variable(env, next(iter(option)), cookiecutter_dict)
        for option in options
    ]

    if no_input:
        selected = rendered_options[0]
    else:
        # Only prompt when input is allowed; previously the prompt ran
        # unconditionally and overwrote the no_input default.
        selected = read_user_choice(key, rendered_options)

    # Look the option up by position so that a templated option name, whose
    # rendered form differs from its raw key, still resolves to its sub-items.
    chosen = options[rendered_options.index(selected)]
    selected_item = next(iter(chosen.values()))

    sub_values = {}

    # then, fill in the sub values for that item
    for subkey, raw in selected_item.items():
        # We are dealing with a regular variable
        val = render_variable(env, raw, cookiecutter_dict)

        if not no_input:
            val = read_user_variable(subkey, val)

        sub_values[subkey] = val

    return {selected: sub_values}


def prompt_for_config(context, no_input=False):
    """
    Prompts the user to enter new config, using context as a source for the
    field names and sample values.

    Variables are handled in two passes: first simple values, choice lists
    and choice lists with sub-items; then dictionary values, whose keys and
    values might refer to variables from the first pass.

    :param context: dict whose ``'cookiecutter'`` entry maps variable names
        to their raw (possibly templated) default values.
    :param no_input: If true, do not prompt the user; use the rendered
        defaults (and the first option of every choice) instead.
    :returns: an ``OrderedDict`` of variable name to final value.
    :raises UndefinedVariableInTemplate: if rendering a variable raises
        jinja2's ``UndefinedError``.
    """
    # `read_user_dict` is not part of the module-level import from
    # cookiecutter.prompt, so bring it into scope here; without it the second
    # pass fails with NameError as soon as a dict variable is prompted for.
    from cookiecutter.prompt import read_user_dict

    cookiecutter_dict = OrderedDict([])
    env = StrictEnvironment(context=context)

    # First pass: Handle simple and raw variables, plus choices.
    # These must be done first because the dictionaries keys and
    # values might refer to them.
    for key, raw in iteritems(context[u'cookiecutter']):
        if key.startswith(u'_'):
            # Private variables are copied through without rendering.
            cookiecutter_dict[key] = raw
            continue

        try:
            if isinstance(raw, list):
                if isinstance(raw[0], dict):
                    # We are dealing with a choice whose options carry sub-items
                    val = _prompt_choice_and_subitems(
                        cookiecutter_dict, env, key, raw, no_input
                    )
                    cookiecutter_dict[key] = val
                else:
                    # We are dealing with a choice variable
                    val = prompt_choice_for_config(
                        cookiecutter_dict, env, key, raw, no_input
                    )
                    cookiecutter_dict[key] = val
            elif not isinstance(raw, dict):
                # We are dealing with a regular variable
                val = render_variable(env, raw, cookiecutter_dict)

                if not no_input:
                    val = read_user_variable(key, val)

                cookiecutter_dict[key] = val
        except UndefinedError as err:
            msg = "Unable to render variable '{}'".format(key)
            raise UndefinedVariableInTemplate(msg, err, context)

    # Second pass; handle the dictionaries.
    for key, raw in iteritems(context[u'cookiecutter']):

        try:
            if isinstance(raw, dict):
                # We are dealing with a dict variable
                val = render_variable(env, raw, cookiecutter_dict)

                if not no_input:
                    val = read_user_dict(key, val)

                cookiecutter_dict[key] = val
        except UndefinedError as err:
            msg = "Unable to render variable '{}'".format(key)
            raise UndefinedVariableInTemplate(msg, err, context)

    return cookiecutter_dict

# from cookiecutter.main import cookiecutter
# from cookiecutter import prompt
# from cookiecutter.cli import main as cc_main

# class NestedQuestion:
# ''' [{'a': {'val1': 'default1', 'val2': 'default2'}}]

# Interprets lists as questions with multiple options, and
# dictionaries as single questions with default values.
# '''
# @classmethod
# def update_context(cls, context, question_structure):
# qd = question_structure
# if isinstance(qd, list):
# selection = cls.get_user_option(qd)

# name, vals = list(selection.items())[0]

# context[name] = {}
# cls.update_context(context[name], vals)

# elif isinstance(qd, dict):
# for k, v in qd.items():
# context[k]= {}

# if isinstance(v, (dict, list)):
# context[k] = cls.update_context(context[k], v)
# else:
# context[k] = cls.get_user_input(k, v)

# return context

# @staticmethod
# def get_user_input(key, default):
# return prompt.read_user_variable(key, default)
# # return input(f"{key} [{default}]: ") or default

# @staticmethod
# def get_user_option(options):
# prompt.read_user_choice()

# # input_msg = '\n'.join(
# # f" [{ix + 1}] - {list(value.keys())[0]}" for ix, value in enumerate(options)
# # )

# # prepend = 'Select an item:\n'
# # postpend = "\n - Enter number [1]: "

# # ix = int(input(prepend + input_msg + postpend) or 1) - 1
# # return options[ix]
27 changes: 22 additions & 5 deletions cookiecutter.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,27 @@
{
"project_name": "project_name",
"repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}",
"module_name": "{{ cookiecutter.project_name.lower().replace(' ', '_').replace('-', '_') }}",
"author_name": "Your name (or your organization/company/team)",
"description": "A short description of the project.",
"open_source_license": ["MIT", "BSD-3-Clause", "No license file"],
"s3_bucket": "[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')",
"aws_profile": "default",
"python_interpreter": ["python3", "python"]
}
"python_version_number": "3.7",
"dataset_storage": [
{"none": {}},
{"azure": {"container": "container-name"}},
{"s3": {"bucket": "bucket-name", "aws_profile": "default"}},
{"gcs": {"bucket": "bucket-name"}}
],
"environment_manager" : [
"none",
"conda",
"virtualenv",
"pipenv"
],
"dependency_file": [
"none",
"requirements.txt",
"environment.yml",
"Pipfile"
],
"open_source_license": ["MIT", "BSD-3-Clause", "No license file"]
}
20 changes: 10 additions & 10 deletions docs/docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ Disagree with a couple of the default folder names? Working on a project that's

## Getting started

With this in mind, we've created a data science cookiecutter template for projects in Python. Your analysis doesn't have to be in Python, but the template does provide some Python boilerplate that you'd want to remove (in the `src` folder for example, and the Sphinx documentation skeleton in `docs`).
With this in mind, we've created a data science cookiecutter template for projects in Python. Your analysis doesn't have to be in Python, but the template does provide some Python boilerplate that you'd want to remove (in the `{{ cookiecutter.module_name }}` folder for example, and the Sphinx documentation skeleton in `docs`).

### Requirements

Expand All @@ -72,7 +72,7 @@ cookiecutter https://github.com/drivendata/cookiecutter-data-science

### Example

<script type="text/javascript" src="https://asciinema.org/a/9bgl5qh17wlop4xyxu9n9wr02.js" id="asciicast-9bgl5qh17wlop4xyxu9n9wr02" async></script>
<script type="text/javascript" src="https://asciinema.org/a/9bgl5qh17wlop4xyxu9n9wr02.js" id="asciicast-9bgl5qh17wlop4xyxu9n9wr02" async></script>
pjbull marked this conversation as resolved.
Show resolved Hide resolved

## Directory structure

Expand Down Expand Up @@ -103,8 +103,8 @@ cookiecutter https://github.com/drivendata/cookiecutter-data-science
│ generated with `pip freeze > requirements.txt`
├── setup.py <- Make this project pip installable with `pip install -e`
├── src <- Source code for use in this project.
│   ├── __init__.py <- Makes src a Python module
├── {{ cookiecutter.module_name }} <- Source code for use in this project.
│   ├── __init__.py <- Makes {{ cookiecutter.module_name }} a Python module
│ │
│   ├── data <- Scripts to download or generate data
│   │   └── make_dataset.py
Expand All @@ -129,7 +129,7 @@ There are some opinions implicit in the project structure that have grown out of

### Data is immutable

Don't ever edit your raw data, especially not manually, and especially not in Excel. Don't overwrite your raw data. Don't save multiple versions of the raw data. Treat the data (and its format) as immutable. The code you write should move the raw data through a pipeline to your final analysis. You shouldn't have to run all of the steps every time you want to make a new figure (see [Analysis is a DAG](#analysis-is-a-dag)), but anyone should be able to reproduce the final products with only the code in `src` and the data in `data/raw`.
Don't ever edit your raw data, especially not manually, and especially not in Excel. Don't overwrite your raw data. Don't save multiple versions of the raw data. Treat the data (and its format) as immutable. The code you write should move the raw data through a pipeline to your final analysis. You shouldn't have to run all of the steps every time you want to make a new figure (see [Analysis is a DAG](#analysis-is-a-dag)), but anyone should be able to reproduce the final products with only the code in `{{ cookiecutter.module_name }}` and the data in `data/raw`.
pjbull marked this conversation as resolved.
Show resolved Hide resolved

Also, if data is immutable, it doesn't need source control in the same way that code does. Therefore, ***by default, the data folder is included in the `.gitignore` file.*** If you have a small amount of data that rarely changes, you may want to include the data in the repository. Github currently warns if files are over 50MB and rejects files over 100MB. Some other options for storing/syncing large data include [AWS S3](https://aws.amazon.com/s3/) with a syncing tool (e.g., [`s3cmd`](http://s3tools.org/s3cmd)), [Git Large File Storage](https://git-lfs.github.com/), [Git Annex](https://git-annex.branchable.com/), and [dat](http://dat-data.com/). Currently by default, we ask for an S3 bucket and use [AWS CLI](http://docs.aws.amazon.com/cli/latest/reference/s3/index.html) to sync data in the `data` folder with the server.

Expand All @@ -141,18 +141,18 @@ Since notebooks are challenging objects for source control (e.g., diffs of the `

1. Follow a naming convention that shows the owner and the order the analysis was done in. We use the format `<step>-<ghuser>-<description>.ipynb` (e.g., `0.3-bull-visualize-distributions.ipynb`).

2. Refactor the good parts. Don't write code to do the same task in multiple notebooks. If it's a data preprocessing task, put it in the pipeline at `src/data/make_dataset.py` and load data from `data/interim`. If it's useful utility code, refactor it to `src`.
2. Refactor the good parts. Don't write code to do the same task in multiple notebooks. If it's a data preprocessing task, put it in the pipeline at `{{ cookiecutter.module_name }}/data/make_dataset.py` and load data from `data/interim`. If it's useful utility code, refactor it to `{{ cookiecutter.module_name }}`.

Now by default we turn the project into a Python package (see the `setup.py` file). You can import your code and use it in notebooks with a cell like the following:

```
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
# OPTIONAL: always reload modules so that as you change code in {{ cookiecutter.module_name }}, it gets loaded
%autoreload 2

from src.data import make_dataset
from {{ cookiecutter.module_name }}.data import make_dataset
```

### Analysis is a DAG
Expand Down Expand Up @@ -192,10 +192,10 @@ OTHER_VARIABLE=something

#### Use a package to load these variables automatically.

If you look at the stub script in `src/data/make_dataset.py`, it uses a package called [python-dotenv](https://github.com/theskumar/python-dotenv) to load up all the entries in this file as environment variables so they are accessible with `os.environ.get`. Here's an example snippet adapted from the `python-dotenv` documentation:
If you look at the stub script in `{{ cookiecutter.module_name }}/data/make_dataset.py`, it uses a package called [python-dotenv](https://github.com/theskumar/python-dotenv) to load up all the entries in this file as environment variables so they are accessible with `os.environ.get`. Here's an example snippet adapted from the `python-dotenv` documentation:

```python
# src/data/dotenv_example.py
# {{ cookiecutter.module_name }}/data/dotenv_example.py
import os
from dotenv import load_dotenv, find_dotenv

Expand Down
66 changes: 66 additions & 0 deletions hooks/post_gen_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import os

# Packages written to every generated dependency file.
packages = [
    'flake8',
    'pathlib2',
    'pip',
    'setuptools',
    'wheel',
]

# Packages that go in the pip section of environment.yml
# (presumably because they are not installable through conda -- TODO confirm).
pip_only_packages = [
    'awscli',
    'python-dotenv',
]

# The template tags below sit inside Python comments so that this file is
# parseable Python before rendering; the template engine still evaluates them
# when the hook is rendered, and the rendered result is valid either way.
# {% if cookiecutter.dataset_storage.s3 %}
packages += ['awscli']
# {% endif %}

# Name of the dependency file chosen when the project was generated.
dependencies = '{{ cookiecutter.dependency_file }}'

def write_dependencies(dependency_file=None, install_packages=None, pip_only=None):
    """Write the dependency file selected for the generated project.

    Supported file names are ``requirements.txt``, ``environment.yml`` and
    ``Pipfile``; any other value (for example ``none``) writes nothing. The
    file is created in the current working directory under that name.

    :param dependency_file: name of the file to write; defaults to the
        module-level ``dependencies`` value.
    :param install_packages: packages to list; defaults to the module-level
        ``packages`` list.
    :param pip_only: packages that belong in the pip section of
        ``environment.yml``; defaults to the module-level
        ``pip_only_packages`` list.
    """
    # Fall back to the module-level configuration so the original
    # zero-argument call keeps working.
    if dependency_file is None:
        dependency_file = dependencies
    if install_packages is None:
        install_packages = packages
    if pip_only is None:
        pip_only = pip_only_packages

    # A package can be present in both lists (awscli is); emit it only once,
    # otherwise the Pipfile ends up with a duplicate key.
    all_packages = sorted(set(install_packages) | set(pip_only))

    if dependency_file == 'requirements.txt':
        # Blank line separates the pinned packages from the editable install.
        lines = all_packages + ["", "-e ."]

    elif dependency_file == 'environment.yml':
        lines = ["name: {{ cookiecutter.repo_name }}",
                 "dependencies:"]

        # Pip-only packages are listed under the pip section, not here.
        lines += [f"  - {p}" for p in install_packages if p not in pip_only]

        # Entries under "pip:" must be indented deeper than the "pip:" item
        # itself to be parsed as its nested list.
        lines += ["  - pip:"] + [f"    - {p}" for p in pip_only]

        lines += ["    - -e ."]

        lines += ["  - python={{ cookiecutter.python_version_number }}"]

    elif dependency_file == 'Pipfile':
        lines = ["[packages]"]
        lines += [f'{p} = "*"' for p in all_packages]

        lines += ['"{{ cookiecutter.module_name }}" = {editable = true, path = "."}']

        lines += [
            "",
            "[requires]",
            'python_version = "{{ cookiecutter.python_version_number }}"'
        ]

    else:
        # No dependency file requested.
        return

    with open(dependency_file, 'w') as f:
        f.write("\n".join(lines))
        # End the file with a newline, as text files conventionally do.
        f.write("\n")


write_dependencies()
Loading