From ba81995a16e65b68f929d02d90fcebf14f95ce56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yann=20Vot=C3=A9?=
Date: Fri, 13 May 2016 17:21:38 +0200
Subject: [PATCH] Add profiles: use Eurovoc groups and SKOS labels

Add a first profile for the ckanext-dcat extension which adds Eurovoc
URIs to a dataset's `groups` property. Consequently, if Eurovoc groups
exist and are identified by their URIs, harvested datasets will
automatically be added to those groups.

Also add a second profile which replaces SKOS concept URIs with their
labels. That is, instead of harvesting a dataset *theme* as
`http://eurovoc.europa.eu/2467`, the theme will be imported as
`African organisation`.
---
 ckanext/datalocale/skosprofile.py | 90 +++++++++++++++++++++++++++++++
 setup.py                          |  8 +++-
 2 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 ckanext/datalocale/skosprofile.py

diff --git a/ckanext/datalocale/skosprofile.py b/ckanext/datalocale/skosprofile.py
new file mode 100644
index 0000000..b8ecae5
--- /dev/null
+++ b/ckanext/datalocale/skosprofile.py
@@ -0,0 +1,90 @@
+import json
+
+from rdflib import URIRef
+from rdflib.namespace import Namespace, SKOS
+
+from pylons import config
+
+from ckanext.dcat.profiles import RDFProfile
+
+
+DCAT = Namespace("http://www.w3.org/ns/dcat#")
+
+
+class LabeledConceptsDCATAPProfile(RDFProfile):
+    """An RDF profile based on the *actual* DCAT-AP specification.
+
+    In this specification, dataset themes and publisher types are SKOS
+    concepts. For each of these concepts, this profile will insert its
+    label instead of its URI.
+
+    It depends on the European DCAT-AP profile (``euro_dcat_ap``).
+    """
+
+    def _concept_label(self, concept_uri, lang):
+        """Return the preferred label of ``concept_uri`` in ``lang``, or
+        ``concept_uri`` itself when it has no such label in the graph."""
+        if not isinstance(concept_uri, basestring):
+            return concept_uri  # Not a URI: nothing to look up
+        labels = self.g.preferredLabel(URIRef(concept_uri), lang=lang)
+        if not labels:
+            return concept_uri
+        _, label = labels[0]
+        return unicode(label)
+
+    def _replace_concept_uris_by_labels(self, dataset_dict, key):
+        """Replace in ``dataset_dict['extras']``, for the given key, all
+        values which are URIs with their corresponding labels.
+
+        The value is either a JSON-encoded list of URIs or a single plain
+        URI. It is written back in the same form and the same order. A
+        missing or empty value is left untouched.
+        """
+        concept_dict = next((d for d in dataset_dict.get('extras', [])
+                             if d.get('key') == key), None)
+        value = concept_dict.get('value') if concept_dict else None
+        if not value or not isinstance(value, basestring):
+            return
+        lang = config.get('ckan.locale_default', 'en')
+        try:
+            concept_uris = json.loads(value)
+        except ValueError:  # Not JSON: a single plain URI
+            concept_uris = None
+        if isinstance(concept_uris, list):
+            labels = [self._concept_label(uri, lang) for uri in concept_uris]
+            concept_dict['value'] = json.dumps(labels, ensure_ascii=False)
+        else:
+            # A plain string stays plain; json.dumps would add quotes
+            concept_dict['value'] = self._concept_label(value, lang)
+
+    def parse_dataset(self, dataset_dict, dataset_ref):
+        self._replace_concept_uris_by_labels(dataset_dict, 'theme')
+        self._replace_concept_uris_by_labels(dataset_dict, 'publisher_type')
+        return dataset_dict
+
+
+class EurovocGroupsDCATAPProfile(RDFProfile):
+    """An RDF profile based on the DCAT-AP specification and that will try to
+    put datasets into Eurovoc groups.
+
+    Thus, it requires that those groups already exist in CKAN and that they
+    are identified (``id`` property) by URI (begins with
+    ``http://eurovoc.europa.eu``).
+
+    This profile also depends on the European DCAT-AP profile
+    (``euro_dcat_ap``).
+    """
+
+    def parse_dataset(self, dataset_dict, dataset_ref):
+        # Groups (Eurovoc domains): extend the groups set by earlier
+        # profiles, if any, instead of discarding them
+        groups = dataset_dict.get('groups') or []
+        dataset_dict['groups'] = groups
+        group_ids = set(group.get('id') for group in groups)
+        for theme_uri in self._object_value_list(dataset_ref, DCAT.theme):
+            scheme_uri = self._object_value(URIRef(theme_uri), SKOS.inScheme)
+            # Skip themes without a scheme and schemes already listed
+            if scheme_uri and scheme_uri not in group_ids:
+                group_ids.add(scheme_uri)
+                groups.append({'id': scheme_uri})
+        return dataset_dict
diff --git a/setup.py b/setup.py
index d712c75..2d83d15 100644
--- a/setup.py
+++ b/setup.py
@@ -59,7 +59,10 @@
     # project is installed. For an analysis of "install_requires" vs pip's
     # requirements files see:
     # https://packaging.python.org/en/latest/technical.html#install-requires-vs-requirements-files
-    install_requires=[],
+    install_requires=[
+        'ckanext-dcat',
+        'rdflib',
+    ],
 
     # If there are data files included in your packages that need to be
     # installed, specify them here. If using Python 2.6 or less, then these
@@ -82,6 +85,9 @@
         datalocale=ckanext.datalocale.plugin:DatalocalePlugin
         [paste.paster_command]
         datastore-cleanup=ckanext.datalocale.commands:DataStoreCleanup
+        [ckan.rdf.profiles]
+        eurovoc_groups_dcat_ap=ckanext.datalocale.skosprofile:EurovocGroupsDCATAPProfile
+        labeled_concepts_dcat_ap=ckanext.datalocale.skosprofile:LabeledConceptsDCATAPProfile
         [babel.extractors]
         ckan = ckan.lib.extract:extract_ckan
     ''',