diff --git a/.env.example b/.env.example index f006f7e..90d2e25 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,9 @@ PYCSW_PORT=8000 PYCSW_DEV_PORT=5678 +## ckan-pycsw unverified mode (True/False). SSL certificate from host will download if SSL_UNVERIFIED_MODE=True, to avoid SSL error when certificate was self-signed. +SSL_UNVERIFIED_MODE=False + # URLS CKAN_URL=http://localhost:5000/ PYCSW_URL=http://localhost:${PYCSW_PORT}/pycsw/csw.py diff --git a/README.md b/README.md index 01a8fc7..c766c1b 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ ## Overview Docker compose environment (based on [pycsw](https://github.com/geopython/pycsw)) for development and testing with CKAN Open Data portals.[^1] -> [!NOTE] +> [!TIP] > It can be easily tested with a CKAN-type Open Data portal deployment: [mjanez/ckan-docker](https://github.com/mjanez/ckan-docker)[^2]. Available components: @@ -29,15 +29,20 @@ Available components: ### With docker compose Copy the `.env.example` template and configure by changing the `.env` file. Change `PYCSW_URL` and `CKAN_URL`, as well as the published port `PYCSW_PORT`, if needed. - ```shell - cp .env.example .env - ``` +```shell +cp .env.example .env +``` + +Select the CKAN Schema (`PYCSW_CKAN_SCHEMA`), and the pycsw output schema (`PYCSW_OUPUT_SCHEMA`): -Select the CKAN Schema (`PYCSW_CKAN_SCHEMA`), and the pycsw output schema (`PYCSW_OUPUT_SCHEMA`). - Default: ```ini PYCSW_CKAN_SCHEMA=iso19139_geodcatap PYCSW_OUPUT_SCHEMA=iso19139_inspire + + ... + + SSL_UNVERIFIED_MODE=True ``` - Avalaible: * CKAN metadata schema (`PYCSW_CKAN_SCHEMA`): @@ -45,11 +50,17 @@ Select the CKAN Schema (`PYCSW_CKAN_SCHEMA`), and the pycsw output schema (`PYCS * `iso19139_base`: [WIP] Base schema. * pycsw metadata schema (`PYCSW_OUPUT_SCHEMA`): - * `iso19139_inspire`, **default**: Customised schema based on ISO 19139 INSPIRE metadata schema. + * `iso19139_inspire`, **default**: Customised schema based on ISO 19139 INSPIRE metadata schema. 
[^4] * `iso19139`: Standard pycsw schema based on ISO 19139. -> [!NOTE] -> The output pycsw schema (`iso19139_inspire`), to comply with INSPIRE ISO 19139 is WIP. The validation of the dataset/series is complete and conforms to the [INSPIRE reference validator](https://inspire.ec.europa.eu/validator/home/index.html) datasets and dataset series (Conformance Class 1, 2, 2b and 2c). In contrast, spatial data services still fail in only 1 dimension [WIP]. +Change `SSL_UNVERIFIED_MODE` to avoid SSL errors when using a self-signed certificate in CKAN `development`. + +- Default: + ```ini + SSL_UNVERIFIED_MODE=True + ``` +> [!WARNING] +> Enabling `SSL_UNVERIFIED_MODE` can expose your application to security risks by allowing unverified SSL certificates. Use this setting only in a trusted development environment and never in production. To deploy the environment, `docker compose` will build the latest source in the repo. @@ -68,15 +79,13 @@ docker compose -f docker-compose.ghcr.yml --build docker compose up -d --build ``` -> [!NOTE] +> [!TIP] > Deploy the dev (multistage build) `docker-compose.dev.yml` with: > >```bash > docker compose -f docker-compose.dev.yml up --build >``` - - -> [!NOTE] +> >If needed, to build a specific container simply run: > >```bash @@ -277,3 +286,4 @@ List of *containers*: [^1]: Extends the @frafra [coat2pycsw](https://github.com/COATnor/coat2pycsw) package. [^2]: A custom installation of Docker Compose with specific extensions for spatial data and [GeoDCAT-AP](https://github.com/SEMICeu/GeoDCAT-AP)/[INSPIRE](https://github.com/INSPIRE-MIF/technical-guidelines) metadata [profiles](https://en.wikipedia.org/wiki/Geospatial_metadata). [^3]: [INSPIRE dataset and service metadata](https://inspire.ec.europa.eu/id/document/tg/metadata-iso19139) based on ISO/TS 19139:2007. +[^4]: The output pycsw schema (`iso19139_inspire`), to comply with INSPIRE ISO 19139 is WIP. 
The validation of the dataset/series is complete and conforms to the [INSPIRE reference validator](https://inspire.ec.europa.eu/validator/home/index.html) datasets and dataset series (Conformance Class 1, 2, 2b and 2c). In contrast, spatial data services still fail in only 1 dimension [WIP]. diff --git a/ckan-pycsw/Dockerfile b/ckan-pycsw/Dockerfile index c19dbc7..63a36a9 100644 --- a/ckan-pycsw/Dockerfile +++ b/ckan-pycsw/Dockerfile @@ -5,7 +5,9 @@ LABEL maintainer="mnl.janez@gmail.com" ENV USERNAME=ckan-pycsw ENV USER_UID=10001 ENV USER_GID=$USER_UID + # ckan-pycsw envvars +ENV CKAN_PYCSW_VERSION=1.0.0 ENV APP_DIR=/app ENV TZ=UTC RUN echo ${TZ} > /etc/timezone @@ -18,20 +20,45 @@ ENV DEV_MODE=False ENV TIMEOUT=300 ENV PYCSW_CRON_DAYS_INTERVAL=2 ENV PYCSW_CRON_HOUR_START=4 +ENV SSL_UNVERIFIED_MODE=False + +# PYCSW Catalog configuration +ENV CSW_IDENTIFICATION_TITLE="Sample Geospatial Catalogue" \ + CSW_IDENTIFICATION_ABSTRACT="OGC CSW server powered by pycsw" \ + CSW_PROVIDER_NAME="ckan-docker development team" \ + CSW_PROVIDER_URL="https://github.com/mjanez/ckan-docker" \ + CSW_CONTACT_NAME="ckan-docker development team" \ + CSW_CONTACT_POSITION="Site Administrator" \ + CSW_CONTACT_ADDRESS="ckan-docker development team" \ + CSW_CONTACT_CITY="Madrid" \ + CSW_CONTACT_STATE_OR_PROVINCE="Madrid" \ + CSW_CONTACT_POSTAL_CODE="28001" \ + CSW_CONTACT_COUNTRY="Spain" \ + CSW_CONTACT_EMAIL=${CKAN_SYSADMIN_EMAIL} \ + CSW_CONTACT_URL=${CKAN_URL} \ + CSW_INSPIRE_DATE="2024-01-01" \ + CSW_INSPIRE_GEMET_KEYWORDS="Utility and governmental services" \ + CSW_INSPIRE_CONFORMITY="notEvaluated" \ + CSW_INSPIRE_CONTACT_NAME="ckan-docker development team" \ + CSW_INSPIRE_CONTACT_EMAIL=${CKAN_SYSADMIN_EMAIL} \ + CSW_INSPIRE_TEMP_EXTENT="2024-01-01/2024-12-31" WORKDIR ${APP_DIR} # Create a new non-root user and group, install necessary packages and remove the package list cache RUN groupadd --gid $USER_GID $USERNAME \ - && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME && \ - chown -R 
$USERNAME:$USERNAME $APP_DIR && \ - apt-get -q -y update && apt-get install -y --no-install-recommends \ + && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \ + && chown -R $USERNAME:$USERNAME $APP_DIR \ + && apt-get -q -y update \ + && apt-get install -y --no-install-recommends \ wget=1.21-1+deb11u1 \ - gettext-base=0.21-4 && \ - wget --progress=dot:mega -O /wait-for https://raw.githubusercontent.com/eficode/wait-for/v2.2.4/wait-for && \ - chmod +x /wait-for && \ - python3 -m pip install --no-cache-dir pdm==2.9.2 && \ - rm -rf /var/lib/apt/lists/* + gettext-base=0.21-4 \ + curl \ + # Remove wait-for + # wget --progress=dot:mega -O /wait-for https://raw.githubusercontent.com/eficode/wait-for/v2.2.4/wait-for \ + # && chmod +x /wait-for \ + && python3 -m pip install --no-cache-dir pdm==2.9.2 \ + && rm -rf /var/lib/apt/lists/* # Install python dependencies with pdm COPY pyproject.toml pdm.lock ./ diff --git a/ckan-pycsw/Dockerfile.dev b/ckan-pycsw/Dockerfile.dev index 76a589d..b560f60 100644 --- a/ckan-pycsw/Dockerfile.dev +++ b/ckan-pycsw/Dockerfile.dev @@ -3,6 +3,7 @@ FROM ghcr.io/mjanez/ckan-pycsw:latest AS base LABEL maintainer="mnl.janez@gmail.com" # ckan-pycsw envvars +ENV CKAN_PYCSW_VERSION=1.0.0 ENV APP_DIR=/app ENV TZ=UTC ENV PYCSW_CKAN_SCHEMA=iso19139_inspire @@ -13,6 +14,7 @@ ENV PYCSW_URL=http://localhost:${PYCSW_PORT}/ ENV DEV_MODE=True ENV PYCSW_DEV_PORT=5678 ENV TIMEOUT=300 +ENV SSL_UNVERIFIED_MODE=False WORKDIR ${APP_DIR} diff --git a/ckan-pycsw/Dockerfile.ghcr b/ckan-pycsw/Dockerfile.ghcr index 3d4ed77..674765a 100644 --- a/ckan-pycsw/Dockerfile.ghcr +++ b/ckan-pycsw/Dockerfile.ghcr @@ -3,6 +3,7 @@ FROM ghcr.io/mjanez/ckan-pycsw:main AS base LABEL maintainer="mnl.janez@gmail.com" # ckan-pycsw envvars +ENV CKAN_PYCSW_VERSION=1.0.0 ENV APP_DIR=/app ENV TZ=UTC ENV PYCSW_CKAN_SCHEMA=iso19139_inspire @@ -14,6 +15,7 @@ ENV DEV_MODE=False ENV TIMEOUT=300 ENV PYCSW_CRON_DAYS_INTERVAL=2 ENV PYCSW_CRON_HOUR_START=4 +ENV SSL_UNVERIFIED_MODE=False WORKDIR 
${APP_DIR} diff --git a/ckan-pycsw/conf/pycsw.conf.template b/ckan-pycsw/conf/pycsw.conf.template index da52d7e..f6cf9a0 100644 --- a/ckan-pycsw/conf/pycsw.conf.template +++ b/ckan-pycsw/conf/pycsw.conf.template @@ -9,7 +9,8 @@ maxrecords=10 #logfile=/tmp/pycsw.log #ogc_schemas_base=http://foo #federatedcatalogues=http://catalog.data.gov/csw -#pretty_print=true +pretty_print=true +# Disable gzip_compresslevel when use httpd #gzip_compresslevel=9 #domainquerytype=range #domaincounts=true @@ -22,27 +23,27 @@ allowed_ips=127.0.0.1 #csw_harvest_pagesize=10 [metadata:main] -identification_title=Sample Geospatial Catalogue -identification_abstract=OGC CSW server powered by pycsw -identification_keywords=catalogue,discovery,metadata -#identification_keywords_type=theme +identification_title=${CSW_IDENTIFICATION_TITLE} +identification_abstract=${CSW_IDENTIFICATION_ABSTRACT} +identification_keywords=catalogue,discovery,metadata,spatial +identification_keywords_type=theme identification_fees=None identification_accessconstraints=None -provider_name=Sample -provider_url=https://example.org/ -#contact_name=admin -#contact_position=Position Title -#contact_address=Mailing Address -#contact_city=Madrid -#contact_stateorprovince=Administrative Area -#contact_postalcode=Zip or Postal Code -contact_country=Spain +provider_name=${CSW_PROVIDER_NAME} +provider_url=${CSW_PROVIDER_URL} +contact_name=${CSW_CONTACT_NAME} +contact_position=${CSW_CONTACT_POSITION} +contact_address=${CSW_CONTACT_ADDRESS} +contact_city=${CSW_CONTACT_CITY} +contact_stateorprovince=${CSW_CONTACT_STATE_OR_PROVINCE} +contact_postalcode=${CSW_CONTACT_POSTAL_CODE} +contact_country=${CSW_CONTACT_COUNTRY} #contact_phone=+xx-xxx-xxx-xxxx #contact_fax=+xx-xxx-xxx-xxxx -contact_email=admin@example.org -#contact_url=https://example.org/members/admin -#contact_hours=Hours of Service -#contact_instructions=During hours of service. Off on weekends. 
+contact_email=${CSW_CONTACT_EMAIL} +contact_url=${CSW_CONTACT_URL} +contact_hours=Hours of Service +contact_instructions=During hours of service. Off on weekends. contact_role=pointOfContact [repository] @@ -54,15 +55,15 @@ database=sqlite:///${PWD}/cite.db #database=mysql://username:password@localhost/pycsw?charset=utf8 #mappings=path/to/mappings.py table=records -#filter=type = 'http://purl.org/dc/dcmitype/Dataset' +#filter=type='http://purl.org/dc/dcmitype/Dataset' [metadata:inspire] enabled=true languages_supported=eng,spa default_language=eng -date=YYYY-MM-DD -gemet_keywords=Utility and governmental services -conformity_service=notEvaluated -contact_name=Organization Name -contact_email=Email Address -temp_extent=YYYY-MM-DD/YYYY-MM-DD \ No newline at end of file +date=${CSW_INSPIRE_DATE} +gemet_keywords=${CSW_INSPIRE_GEMET_KEYWORDS} +conformity_service=${CSW_INSPIRE_CONFORMITY} +contact_name=${CSW_INSPIRE_CONTACT_NAME} +contact_email=${CSW_INSPIRE_CONTACT_EMAIL} +temp_extent=${CSW_INSPIRE_TEMP_EXTENT} \ No newline at end of file diff --git a/ckan-pycsw/docker-entrypoint.d/entrypoint.sh b/ckan-pycsw/docker-entrypoint.d/entrypoint.sh index 9e24c2e..bc6be43 100644 --- a/ckan-pycsw/docker-entrypoint.d/entrypoint.sh +++ b/ckan-pycsw/docker-entrypoint.d/entrypoint.sh @@ -4,7 +4,27 @@ set -xeuo pipefail envsubst < pycsw.conf.template > pycsw.conf -#TODO: -Xfrozen_modules=off from: https://bugs.python.org/issue1666807 -/wait-for --timeout "$TIMEOUT" "$CKAN_URL" -- pdm run python3 -Xfrozen_modules=off ckan2pycsw/ckan2pycsw.py +# TODO: -Xfrozen_modules=off from: https://bugs.python.org/issue1666807 + +# Check if SSL_UNVERIFIED_MODE is enabled +if [ "${SSL_UNVERIFIED_MODE:-false}" = "true" ] || [ "${SSL_UNVERIFIED_MODE:-false}" = "True" ]; then + export REQUESTS_CA_BUNDLE="" + export CURL_CA_BUNDLE="" + SSL_FLAGS="--insecure" # Add SSL ignore flag + echo "[INSECURE] SSL_UNVERIFIED_MODE is enabled. SSL certificate verification is disabled." 
+else + SSL_FLAGS="" +fi + +# Use curl directly instead of wait-for if necessary +echo "Waiting for $CKAN_URL to become available..." +until curl $SSL_FLAGS --output /dev/null --silent --head --fail "$CKAN_URL"; do + printf '.' + sleep 5 +done +echo 'CKAN is available.' + +# Run the Python command +pdm run python3 -Xfrozen_modules=off ckan2pycsw/ckan2pycsw.py exec "$@" diff --git a/ckan-pycsw/docker-entrypoint.d/entrypoint_dev.sh b/ckan-pycsw/docker-entrypoint.d/entrypoint_dev.sh index e663fb2..fc3931b 100644 --- a/ckan-pycsw/docker-entrypoint.d/entrypoint_dev.sh +++ b/ckan-pycsw/docker-entrypoint.d/entrypoint_dev.sh @@ -4,6 +4,25 @@ set -xeuo pipefail envsubst < pycsw.conf.template > pycsw.conf -/wait-for --timeout "$TIMEOUT" "$CKAN_URL" -- pdm run python3 -m ptvsd --host 0.0.0.0 --port "$PYCSW_DEV_PORT" --wait ckan2pycsw/ckan2pycsw.py +# Check if SSL_UNVERIFIED_MODE is enabled +if [ "${SSL_UNVERIFIED_MODE:-false}" = "true" ] || [ "${SSL_UNVERIFIED_MODE:-false}" = "True" ]; then + export REQUESTS_CA_BUNDLE="" + export CURL_CA_BUNDLE="" + SSL_FLAGS="--insecure" # Add SSL ignore flag + echo "[INSECURE] SSL_UNVERIFIED_MODE is enabled. SSL certificate verification is disabled." +else + SSL_FLAGS="" +fi + +# Use curl directly instead of wait-for if necessary +echo "Waiting for $CKAN_URL to become available..." +until curl $SSL_FLAGS --output /dev/null --silent --head --fail "$CKAN_URL"; do + printf '.' + sleep 5 +done +echo 'CKAN is available.' 
+ +# Execute Python command with debugging +pdm run python3 -m ptvsd --host 0.0.0.0 --port "$PYCSW_DEV_PORT" --wait ckan2pycsw/ckan2pycsw.py exec "$@" diff --git a/ckan2pycsw/ckan2pycsw.py b/ckan2pycsw/ckan2pycsw.py index 72039ec..3316c7d 100644 --- a/ckan2pycsw/ckan2pycsw.py +++ b/ckan2pycsw/ckan2pycsw.py @@ -38,6 +38,7 @@ except (KeyError, ValueError): PYCSW_CRON_HOUR_START = 4 method = "nightly" +CKAN_PYCSW_VERSION = os.environ.get("CKAN_PYCSW_VERSION", "1.0.0") URL = os.environ.get("CKAN_URL", 'http://localhost:5000/') PYCSW_PORT = os.environ.get("PYCSW_PORT", 8000) PYCSW_URL = os.environ.get("PYCSW_URL", f'http://localhost:{PYCSW_PORT}/') @@ -54,6 +55,7 @@ "iso19139_inspire": ISO19139_inspireOutputSchema, "iso19139": ISO19139OutputSchema } +SSL_UNVERIFIED_MODE = os.environ.get("SSL_UNVERIFIED_MODE", "False").strip().lower() == "true"  # env vars are strings: parse to bool so "False" is not truthy def get_datasets(base_url): @@ -75,13 +77,17 @@ def get_datasets(base_url): try: if not base_url.endswith("/"): base_url += "/" + + if SSL_UNVERIFIED_MODE: + logging.warning(f"[INSECURE] SSL_UNVERIFIED_MODE:'{SSL_UNVERIFIED_MODE}'. 
Only if you trust the CKAN_URL: {base_url}.") + package_search = urljoin(base_url, "api/3/action/package_search") - res = requests.get(package_search, params={"rows": 0}) + res = requests.get(package_search, params={"rows": 0}, verify=not SSL_UNVERIFIED_MODE) res.raise_for_status() # Raises a HTTPError if the response is not 200 end = res.json().get("result", {}).get("count", 0) rows = 10 for start in range(0, end, rows): - res = requests.get(package_search, params={"start": start, "rows": rows}) + res = requests.get(package_search, params={"start": start, "rows": rows}, verify=not SSL_UNVERIFIED_MODE) res.raise_for_status() # Check response status try: datasets = res.json()["result"]["results"] @@ -117,7 +123,7 @@ def main(): None """ log_file(APP_DIR + "/log") - logging.info(f"{log_module}:ckan2pycsw | Version: 0.1") + logging.info(f"{log_module}:ckan2pycsw | Version: {CKAN_PYCSW_VERSION}") pycsw_config = ConfigParser() pycsw_config.read_file(open(PYCSW_CONF)) database_raw = pycsw_config.get("repository", "database") diff --git a/ckan2pycsw/model/template.py b/ckan2pycsw/model/template.py index c930470..e74ca8a 100644 --- a/ckan2pycsw/model/template.py +++ b/ckan2pycsw/model/template.py @@ -21,6 +21,7 @@ log_module = "[template]" APP_DIR = os.environ["APP_DIR"] +LOG_DIR = APP_DIR + "/log" LOGGER = logging.getLogger(__name__) SCHEMAS_CKAN = pathlib.Path(__file__).resolve().parent.parent / 'schemas/ckan' SCHEMAS_PYGEOMETA = pathlib.Path(__file__).resolve().parent.parent / 'schemas/pygeometa'