
Merge pull request #11 from GSA/develop
Merging for 1.25 release
Yatin Khadilkar committed Jan 16, 2015
2 parents 3ee1bd2 + b6154f2 commit 7a8d6d0
Showing 2 changed files with 40 additions and 49 deletions.
52 changes: 15 additions & 37 deletions ckanext/datajson/datajsonvalidator.py
@@ -1,10 +1,5 @@
 import re

-# from the iso8601 package, plus ^ and $ on the edges
-ISO8601_REGEX = re.compile(r"^([0-9]{4})(-([0-9]{1,2})(-([0-9]{1,2})"
-                           r"((.)([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?"
-                           r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?$")
-
 TEMPORAL_REGEX_1 = re.compile(
     r'^([\+-]?\d{4}(?!\d{2}\b))((-?)((0[1-9]|1[0-2])(\3([12]\d|0[1-9]|3[01]))?|W([0-4]\d|5[0-2])(-?[1-7])?'
     r'|(00[1-9]|0[1-9]\d|[12]\d{2}|3([0-5]\d|6[1-6])))([T\s]((([01]\d|2[0-3])((:?)[0-5]\d)?|24\:?00)([\.,]'
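Note: the removed ISO8601_REGEX (its only caller, check_date_field, is deleted further down in this diff) accepted progressively truncated ISO 8601 values. A minimal sketch of its behavior, assuming the pattern exactly as shown above:

    import re

    ISO8601_REGEX = re.compile(r"^([0-9]{4})(-([0-9]{1,2})(-([0-9]{1,2})"
                               r"((.)([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?"
                               r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?$")

    assert ISO8601_REGEX.match("2015-01-16T10:30:00Z")  # full date-time
    assert ISO8601_REGEX.match("2015-01-16")            # date only
    assert ISO8601_REGEX.match("2015")                  # a bare year also passes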
@@ -107,11 +102,11 @@ def do_validation(doc, errors_array):
dataset_name = "dataset %d" % (i + 1)

# title
if check_string_field(item, "title", 1, dataset_name, errs):
if check_required_string_field(item, "title", 1, dataset_name, errs):
dataset_name = '"%s"' % item.get("title", "").strip()

# accessLevel # required
if check_string_field(item, "accessLevel", 3, dataset_name, errs):
if check_required_string_field(item, "accessLevel", 3, dataset_name, errs):
if item["accessLevel"] not in ("public", "restricted public", "non-public"):
add_error(errs, 5, "Invalid Required Field Value",
"The field 'accessLevel' had an invalid value: \"%s\"" % item["accessLevel"],
@@ -137,10 +132,10 @@ def do_validation(doc, errors_array):
if check_required_field(item, "contactPoint", dict, dataset_name, errs):
cp = item["contactPoint"]
# contactPoint - fn # required
check_string_field(cp, "fn", 1, dataset_name, errs)
check_required_string_field(cp, "fn", 1, dataset_name, errs)

# contactPoint - hasEmail # required
if check_string_field(cp, "hasEmail", 9, dataset_name, errs):
if check_required_string_field(cp, "hasEmail", 9, dataset_name, errs):
import lepl.apps.rfc3696

email_validator = lepl.apps.rfc3696.Email()
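Note: the hasEmail check delegates to the lepl library's RFC 3696 validator imported above. A sketch of the validator's behavior, assuming lepl is installed and using its documented Email() factory, which returns a callable that reports whether a string is a well-formed address:

    import lepl.apps.rfc3696

    email_validator = lepl.apps.rfc3696.Email()
    assert email_validator("data@gsa.gov")      # well-formed address
    assert not email_validator("not an email")  # rejected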
@@ -151,10 +146,10 @@ def do_validation(doc, errors_array):
                               dataset_name)

         # description # required
-        check_string_field(item, "description", 1, dataset_name, errs)
+        check_required_string_field(item, "description", 1, dataset_name, errs)

         # identifier #required
-        if check_string_field(item, "identifier", 1, dataset_name, errs):
+        if check_required_string_field(item, "identifier", 1, dataset_name, errs):
             if item["identifier"] in seen_identifiers:
                 add_error(errs, 5, "Invalid Required Field Value",
                           "The dataset identifier \"%s\" is used more than once." % item["identifier"],
@@ -175,7 +170,7 @@ def do_validation(doc, errors_array):
"A keyword in the keyword array was an empty string.", dataset_name)

# modified # required
if check_string_field(item, "modified", 1, dataset_name, errs):
if check_required_string_field(item, "modified", 1, dataset_name, errs):
if not MODIFIED_REGEX_1.match(item['modified']) \
and not MODIFIED_REGEX_2.match(item['modified']) \
and not MODIFIED_REGEX_3.match(item['modified']):
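Note: MODIFIED_REGEX_1 through MODIFIED_REGEX_3 are defined above this hunk; per the Project Open Data schema, 'modified' is expected to accept an ISO 8601 date/date-time or a repeating duration such as R/P1D. A sketch with simplified stand-in patterns (the real regexes are stricter):

    import re

    # Hypothetical stand-ins for MODIFIED_REGEX_1..3:
    iso_date = re.compile(r'^\d{4}-\d{2}-\d{2}$')
    repeating = re.compile(r'^R/P\d+[YMWD]$')

    for value in ("2015-01-16", "R/P1D"):
        assert iso_date.match(value) or repeating.match(value)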
@@ -195,7 +190,7 @@ def do_validation(doc, errors_array):
         # publisher # required
         if check_required_field(item, "publisher", dict, dataset_name, errs):
             # publisher - name # required
-            check_string_field(item["publisher"], "name", 1, dataset_name, errs)
+            check_required_string_field(item["publisher"], "name", 1, dataset_name, errs)

         # Required-If-Applicable

@@ -222,7 +217,7 @@ def do_validation(doc, errors_array):

                 # distribution - mediaType # Required-If-Applicable
                 if 'downloadURL' in dt:
-                    if check_string_field(dt, "mediaType", 1, distribution_name, errs):
+                    if check_required_string_field(dt, "mediaType", 1, distribution_name, errs):
                         if not IANA_MIME_REGEX.match(dt["mediaType"]):
                             add_error(errs, 5, "Invalid Field Value",
                                       "The distribution mediaType \"%s\" is invalid. "
@@ -249,23 +244,23 @@ def do_validation(doc, errors_array):

 
                 # distribution - description # optional
                 if dt.get("description") is not None:
-                    check_string_field(dt, "description", 1, distribution_name, errs)
+                    check_required_string_field(dt, "description", 1, distribution_name, errs)

                 # distribution - format # optional
                 if dt.get("format") is not None:
-                    check_string_field(dt, "format", 1, distribution_name, errs)
+                    check_required_string_field(dt, "format", 1, distribution_name, errs)

                 # distribution - title # optional
                 if dt.get("title") is not None:
-                    check_string_field(dt, "title", 1, distribution_name, errs)
+                    check_required_string_field(dt, "title", 1, distribution_name, errs)

         # license # Required-If-Applicable
         check_url_field(False, item, "license", dataset_name, errs)

         # rights # Required-If-Applicable
         # TODO move to warnings
         # if item.get("accessLevel") != "public":
-        # check_string_field(item, "rights", 1, dataset_name, errs)
+        # check_string_field(item, "rights", 1, dataset_name, errs)

         # spatial # Required-If-Applicable
         # TODO: There are more requirements than it be a string.
@@ -312,7 +307,7 @@ def do_validation(doc, errors_array):

         # isPartOf # optional
         if item.get("isPartOf"):
-            check_string_field(item, "isPartOf", 1, dataset_name, errs)
+            check_required_string_field(item, "isPartOf", 1, dataset_name, errs)

         # issued # optional
         if item.get("issued") is not None:
@@ -415,7 +410,7 @@ def check_required_field(obj, field_name, data_type, dataset_name, errs):
     return True


-def check_string_field(obj, field_name, min_length, dataset_name, errs):
+def check_required_string_field(obj, field_name, min_length, dataset_name, errs):
     # checks that a required field exists, is typed as a string, and has a minimum length
     if not check_required_field(obj, field_name, (str, unicode), dataset_name, errs):
         return False
@@ -431,23 +426,6 @@ def check_string_field(obj, field_name, min_length, dataset_name, errs):
     return True


-def check_date_field(obj, field_name, dataset_name, errs):
-    # checks that a required date field exists and looks like a date
-    if not check_required_field(obj, field_name, (str, unicode), dataset_name, errs):
-        return False
-    elif len(obj[field_name].strip()) == 0:
-        add_error(errs, 10, "Missing Required Fields", "The '%s' field is present but empty." % field_name,
-                  dataset_name)
-        return False
-    else:
-        if not ISO8601_REGEX.match(obj[field_name]):
-            add_error(errs, 5, "Invalid Required Field Value",
-                      "The '%s' field has an invalid ISO 8601 date or date-time value: \"%s\"." % (
-                          field_name, obj[field_name]), dataset_name)
-            return False
-    return True
-
-
 def check_url_field(required, obj, field_name, dataset_name, errs):
     # checks that a required or optional field, if specified, looks like a URL
     if not required and (field_name not in obj or obj[field_name] is None): return True  # not required, so OK
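Note: the bulk of this file's diff is the rename of check_string_field to check_required_string_field (behavior unchanged), plus the removal of the now-unused check_date_field and its ISO8601_REGEX. A sketch of the helper's calling convention, assuming the definition above and an errs accumulator whose exact shape is set up elsewhere in the module:

    errs = {}
    item = {"title": ""}
    if check_required_string_field(item, "title", 1, "dataset 1", errs):
        title = item["title"].strip()  # safe: present, a string, and long enough
    else:
        pass  # a missing/empty-field error was recorded in errs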
37 changes: 25 additions & 12 deletions ckanext/datajson/plugin.py
@@ -217,15 +217,16 @@ def make_edi(owner_org):
     logger.addHandler(eh)

     # Build the data.json file.
-    packages = get_all_group_packages(group_id=owner_org)
+    packages = get_packages(owner_org)

     output = []
     for pkg in packages:
-        if pkg['owner_org'] == owner_org:
-            datajson_entry = make_datajson_entry(pkg)
-            if datajson_entry and is_valid(datajson_entry):
-                output.append(datajson_entry)
-            else:
-                logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None))
+        #if pkg['owner_org'] == owner_org:
+        datajson_entry = make_datajson_entry(pkg)
+        if datajson_entry and is_valid(datajson_entry):
+            output.append(datajson_entry)
+        else:
+            logger.warn("Dataset id=[%s], title=[%s] omitted\n", pkg.get('id', None), pkg.get('title', None))

     # Get the error log
     eh.flush()
@@ -247,18 +248,15 @@ def make_pdl(owner_org):
     eh.setFormatter(formatter)
     logger.addHandler(eh)

-
     # Build the data.json file.
-    packages = get_all_group_packages(group_id=owner_org)
+    packages = get_packages(owner_org)

     output = []
     #Create data.json only using public datasets, datasets marked non-public are not exposed
     for pkg in packages:
         extras = dict([(x['key'], x['value']) for x in pkg['extras']])
         try:
-            if pkg['owner_org'] == owner_org \
-                    and not (re.match(r'[Nn]on-public', extras['public_access_level'])):
-
+            if not (re.match(r'[Nn]on-public', extras['public_access_level'])):
                 datajson_entry = make_datajson_entry(pkg)
                 if datajson_entry and is_valid(datajson_entry):
                     output.append(datajson_entry)
@@ -280,6 +278,21 @@ def make_pdl(owner_org):
     #return json.dumps(output)
     return write_zip(output, error, zip_name='pdl')
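Note: re.match anchors at the start of the string, so the filter above drops a dataset only when its public_access_level extra begins with 'non-public' or 'Non-public':

    import re

    assert re.match(r'[Nn]on-public', 'non-public')             # excluded from the PDL
    assert not re.match(r'[Nn]on-public', 'public')             # included
    assert not re.match(r'[Nn]on-public', 'restricted public')  # included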

+def get_packages(owner_org):
+    # Build the data.json file.
+    packages = get_all_group_packages(group_id=owner_org)
+    #get packages for sub-agencies.
+    sub_agency = model.Group.get(owner_org)
+    if 'sub-agencies' in sub_agency.extras.col.target and \
+            sub_agency.extras.col.target['sub-agencies'].state == 'active':
+        sub_agencies = sub_agency.extras.col.target['sub-agencies'].value
+        sub_agencies_list = sub_agencies.split(",")
+        for sub in sub_agencies_list:
+            sub_packages = get_all_group_packages(group_id=sub)
+            for sub_package in sub_packages:
+                packages.append(sub_package)
+
+    return packages
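Note: the new get_packages helper widens the harvest to sub-agencies. It assumes the parent group carries an active 'sub-agencies' extra whose value is a comma-separated list of child group ids, e.g.:

    # Hypothetical group extra on the parent organization:
    #   key:   'sub-agencies'
    #   value: 'gsa-sub-one,gsa-sub-two'
    #   state: 'active'
    packages = get_packages('gsa-gov')  # parent packages plus both sub-agencies' packages

This is presumably also why the per-package owner_org filter in make_edi was commented out above: packages harvested from sub-agency groups would otherwise be dropped.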

 def get_all_group_packages(group_id):
     """
