usegalaxy-eu · cgirardot · Dec 24, 2024 · Dec 25, 2024 · Dec 25, 2024 · Dec 25, 2024
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -36,7 +36,7 @@ jobs:
         echo "password: ${{ secrets.ENA_PASSWORD }}" >> .secrets.yml
     - name: Test submission in --draft mode
       run: |
-        ena-upload-cli --action add --draft --dev --center ${{ secrets.ENA_CENTER }} --data example_data/ENA_TEST1.R1.fastq.gz example_data/ENA_TEST2.R1.fastq.gz example_data/ENA_TEST2.R2.fastq.gz --checklist ERC000033 --secret .secret.yml --xlsx example_tables/ENA_excel_example_ERC000033.xlsx 
+        ena-upload-cli --action add --draft --dev --center TEST --data example_data/ENA_TEST1.R1.fastq.gz example_data/ENA_TEST2.R1.fastq.gz example_data/ENA_TEST2.R2.fastq.gz --checklist ERC000033 --secret .secret.yml --xlsx example_tables/ENA_excel_example_ERC000033.xlsx 
     - name: Run Python to get temp directory
       run: |
         echo "TEMP_DIR=$(python -c 'import tempfile; print(tempfile.gettempdir())')" >> $GITHUB_ENV

diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,5 @@
 build/
 ena_upload_cli.egg-info/
 __pycache__/
+tests/ena_upload/
+.idea/
diff --git a/README.md b/README.md
@@ -108,6 +108,14 @@ The command line tool will automatically fetch the correct scientific name based
 | sample_alias_4 | sample_title_2 | 2697049  | Severe acute respiratory syndrome coronavirus 2 | covid-19    | sample_description_1 | 2020-10-11      | Argentina                                |
 | sample_alias_5 | sample_title_3 | 2697049  | Severe acute respiratory syndrome coronavirus 2 | covid-19    | sample_description_2 | 2008-01-24      | Belgium                                  |
 
+#### Custom attributes
+Additional custom attributes (i.e. attributes not specified in the ERC checklist) can be added to the sample table by adding columns whose headers follow the pattern `sample_attribute[attribute_name]`, for example `sample_attribute[treatment]` or `sample_attribute[age]`.
+
+| alias          | ...            | sample_attribute[treatment] | sample_attribute[age] |
+|----------------|----------------|-----------------------------|-----------------------|
+| sample_alias_4 | ...            | treated                     | 2 days                |
+| sample_alias_5 | ...            | untreated                   | 2 days                |
+
 #### Viral submissions
 
 If you want to submit viral samples you can use the [ENA virus pathogen](https://www.ebi.ac.uk/ena/browser/view/ERC000033) checklist by adding `ERC000033` to the checklist parameter. Check out our [viral example command](#test-the-tool) as demonstration. Please use the [ENA virus pathogen](https://github.com/ELIXIR-Belgium/ENA-metadata-templates/tree/main/templates/ERC000033) checklist in our template repo to know what is allowed/possible in the `Controlled vocabulary`fields.
@@ -116,6 +124,8 @@ If you want to submit viral samples you can use the [ENA virus pathogen](https:/
 
 Please check out the [template](https://github.com/ELIXIR-Belgium/ENA-metadata-templates) of your checklist to discover which attributes are mandatory for the study, experiment and run ENA object.
 
+#### Study and Experiment custom attributes
+As with samples, additional custom attributes can be added to the experiment and study tables by adding columns whose headers follow the pattern `experiment_attribute[attribute_name]` and `study_attribute[attribute_name]`, respectively.
 
 ### Dev instance
 

diff --git a/ena_upload/ena_upload.py b/ena_upload/ena_upload.py
@@ -214,13 +214,28 @@ def generate_stream(schema, targets, Template, center, tool):
     :return: stream
     '''
 
+    # find all columns in targets which column header matches the pattern attribute[(.*)], extract the group
+    # and return a dict[header] = group
+    # eg for header run_attribute[sex] => {'run_attribute[sex]': 'sex'}
+    pattern = re.compile(rf"{schema}_attribute\[(.*)\]")
+    extra_attributes = {}
+    for column in targets.columns:
+        match = re.match(pattern, column)
+        if match:
+            extra_attributes[column] = match.group(1)
+
     if schema == 'run':
         # These attributes are required for rendering
         # the run xml templates
         # Adding backwards compatibility for file_format
         if 'file_format' in targets:
             targets.rename(columns={'file_format': 'file_type'}, inplace=True)
         file_attrib = ['file_name', 'file_type', 'file_checksum']
+        if 'read_type' in targets:
+            file_attrib.append('read_type')
+        if 'read_label' in targets:
+            file_attrib.append('read_label')
+
         other_attrib = ['alias', 'experiment_alias']
         # Create groups with alias as index
         run_groups = targets[other_attrib].groupby('alias')['experiment_alias'].first().to_dict()
@@ -230,11 +245,14 @@ def generate_stream(schema, targets, Template, center, tool):
         stream = Template.generate(run_groups=run_groups,
                                    file_groups=file_groups,
                                    center=center,
+                                   extra_attributes=extra_attributes,
                                    tool_name=tool['tool_name'],
                                    tool_version=tool['tool_version'])
     else:
         stream = Template.generate(
-            df=targets, center=center, tool_name=tool['tool_name'], tool_version=tool['tool_version'])
+            df=targets, center=center, extra_attributes=extra_attributes,
+            tool_name=tool['tool_name'], tool_version=tool['tool_version']
+        )
 
     return stream
 
@@ -982,7 +1000,7 @@ def main():
                 if pd.notna(row['scientific_name']) and pd.isna(row['taxon_id']):
                     # retrieve taxon id using scientific name
                     taxonID = get_taxon_id(row['scientific_name'])
-                    df.loc[index, 'taxon_id'] = taxonID
+                    df.loc[index, 'taxon_id'] = int(taxonID)
                 elif pd.notna(row['taxon_id']) and pd.isna(row['scientific_name']):
                     # retrieve scientific name using taxon id
                     scientificName = get_scientific_name(row['taxon_id'])

diff --git a/ena_upload/templates/ENA_template_READ_TYPE.xml b/ena_upload/templates/ENA_template_READ_TYPE.xml
@@ -0,0 +1,9 @@
+<py:choose xmlns:py="http://genshi.edgewall.org/" test="">
+<READ_TYPE py:when="rtype.strip().lower() == 'single'">single</READ_TYPE>
+<READ_TYPE py:when="rtype.strip().lower() == 'paired'">paired</READ_TYPE>
+<READ_TYPE py:when="rtype.strip().lower() == 'cell_barcode'">cell_barcode</READ_TYPE>
+<READ_TYPE py:when="rtype.strip().lower() == 'umi_barcode'">umi_barcode</READ_TYPE>
+<READ_TYPE py:when="rtype.strip().lower() == 'feature_barcode'">feature_barcode</READ_TYPE>
+<READ_TYPE py:when="rtype.strip().lower() == 'sample_barcode'">sample_barcode</READ_TYPE>
+<READ_TYPE py:when="rtype.strip().lower() == 'spatial_barcode'">spatial_barcode</READ_TYPE>
+</py:choose>
diff --git a/ena_upload/templates/ENA_template_experiments.xml b/ena_upload/templates/ENA_template_experiments.xml
@@ -17,62 +17,70 @@ def mandatorytest(row, column, index):
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_6/SRA.experiment.xsd">
     <py:for each="index, row in df.iterrows()">
-        <EXPERIMENT alias="${row.alias}" center_name="${center}">
-            <py:if test="mandatorytest(row, 'title', index)">
-            <TITLE>${row.title}</TITLE>
+    <EXPERIMENT alias="${row.alias}" center_name="${center}">
+        <py:if test="mandatorytest(row, 'title', index)">
+        <TITLE>${row.title}</TITLE>
+        </py:if>
+        <py:if test="mandatorytest(row, 'study_alias', index)">
+        <STUDY_REF refname="${row.study_alias}"/>
+        </py:if>
+        <DESIGN>
+            <py:if test="mandatorytest(row, 'design_description', index)">
+            <DESIGN_DESCRIPTION>${row.design_description}</DESIGN_DESCRIPTION>
             </py:if>
-            <py:if test="mandatorytest(row, 'study_alias', index)">
-            <STUDY_REF refname="${row.study_alias}"/>
+            <py:if test="attributetest(row, 'spot_descriptor')">
+            <SPOT_DESCRIPTOR>${row.spot_descriptor}</SPOT_DESCRIPTOR>
             </py:if>
-            <DESIGN>
-                <py:if test="mandatorytest(row, 'design_description', index)">
-                <DESIGN_DESCRIPTION>${row.design_description}</DESIGN_DESCRIPTION>
+            <py:if test="mandatorytest(row, 'sample_alias', index)">
+            <SAMPLE_DESCRIPTOR refname="${row.sample_alias}"/>
+            </py:if>
+            <LIBRARY_DESCRIPTOR>
+                <py:if test="attributetest(row, 'library_name')">
+                <LIBRARY_NAME>${row.library_name}</LIBRARY_NAME>
                 </py:if>
-                <py:if test="attributetest(row, 'spot_descriptor')">
-                <SPOT_DESCRIPTOR>${row.spot_descriptor}</SPOT_DESCRIPTOR>
+                <py:if test="mandatorytest(row, 'library_strategy', index)">
+                <xi:include href="ENA_template_LIBRARY_STRATEGY.xml" />
                 </py:if>
-                <py:if test="mandatorytest(row, 'sample_alias', index)">
-                <SAMPLE_DESCRIPTOR refname="${row.sample_alias}"/>
+                <py:if test="mandatorytest(row, 'library_source', index)">
+                <xi:include href="ENA_template_LIBRARY_SOURCE.xml" />
                 </py:if>
-                <LIBRARY_DESCRIPTOR>
-                    <py:if test="attributetest(row, 'library_name')">
-                    <LIBRARY_NAME>${row.library_name}</LIBRARY_NAME>
-                    </py:if>
-                    <py:if test="mandatorytest(row, 'library_strategy', index)">
-                    <xi:include href="ENA_template_LIBRARY_STRATEGY.xml" />
-                    </py:if>
-                    <py:if test="mandatorytest(row, 'library_source', index)">
-                    <xi:include href="ENA_template_LIBRARY_SOURCE.xml" />
-                    </py:if>
-                    <py:if test="mandatorytest(row, 'library_selection', index)">
-                    <xi:include href="ENA_template_LIBRARY_SELECTION.xml" />
-                    </py:if>
-                    <py:if test="mandatorytest(row, 'library_layout', index)">
-                    <LIBRARY_LAYOUT py:choose="">
-                        <PAIRED py:when="row.library_layout.lower().strip() == 'paired'" NOMINAL_LENGTH="${int(row.insert_size)}" />
-                        <SINGLE py:when="row.library_layout.lower().strip() == 'single'" />
-                    </LIBRARY_LAYOUT>
-                    </py:if>
-                    <py:if test="attributetest(row, 'library_construction_protocol')">
-                    <LIBRARY_CONSTRUCTION_PROTOCOL>${row.library_construction_protocol}</LIBRARY_CONSTRUCTION_PROTOCOL>
-                    </py:if>
-                </LIBRARY_DESCRIPTOR>
-            </DESIGN>
-            <py:if test="mandatorytest(row, 'platform', index)">
-            <py:if test="mandatorytest(row, 'instrument_model', index)">
-            <xi:include href="ENA_template_PLATFORM.xml" />
-            </py:if>
+                <py:if test="mandatorytest(row, 'library_selection', index)">
+                <xi:include href="ENA_template_LIBRARY_SELECTION.xml" />
+                </py:if>
+                <py:if test="mandatorytest(row, 'library_layout', index)">
+                <LIBRARY_LAYOUT py:choose="">
+                    <PAIRED py:when="row.library_layout.lower().strip() == 'paired'" NOMINAL_LENGTH="${int(row.insert_size)}" />
+                    <SINGLE py:when="row.library_layout.lower().strip() == 'single'" />
+                </LIBRARY_LAYOUT>
+                </py:if>
+                <py:if test="attributetest(row, 'library_construction_protocol')">
+                <LIBRARY_CONSTRUCTION_PROTOCOL>${row.library_construction_protocol}</LIBRARY_CONSTRUCTION_PROTOCOL>
+                </py:if>
+            </LIBRARY_DESCRIPTOR>
+        </DESIGN>
+        <py:if test="mandatorytest(row, 'platform', index)">
+        <py:if test="mandatorytest(row, 'instrument_model', index)">
+        <xi:include href="ENA_template_PLATFORM.xml" />
+        </py:if>
+        </py:if>
+        <EXPERIMENT_ATTRIBUTES>
+            <py:for each="header, tag in extra_attributes.items()">
+            <py:if test="attributetest(row, header)">
+            <EXPERIMENT_ATTRIBUTE>
+                <TAG>${tag}</TAG>
+                <VALUE>${row[header]}</VALUE>
+            </EXPERIMENT_ATTRIBUTE>
             </py:if>
-            <EXPERIMENT_ATTRIBUTES>
-                <EXPERIMENT_ATTRIBUTE>
-                    <TAG>SUBMISSION_TOOL</TAG>
-                    <VALUE>${tool_name}</VALUE>
-                </EXPERIMENT_ATTRIBUTE>
-                <EXPERIMENT_ATTRIBUTE>
-                    <TAG>SUBMISSION_TOOL_VERSION</TAG>
-                    <VALUE>${tool_version}</VALUE>
-                </EXPERIMENT_ATTRIBUTE>
-            </EXPERIMENT_ATTRIBUTES>
-        </EXPERIMENT>
+            </py:for>
+            <EXPERIMENT_ATTRIBUTE>
+                <TAG>SUBMISSION_TOOL</TAG>
+                <VALUE>${tool_name}</VALUE>
+            </EXPERIMENT_ATTRIBUTE>
+            <EXPERIMENT_ATTRIBUTE>
+                <TAG>SUBMISSION_TOOL_VERSION</TAG>
+                <VALUE>${tool_version}</VALUE>
+            </EXPERIMENT_ATTRIBUTE>
+        </EXPERIMENT_ATTRIBUTES>
+    </EXPERIMENT>
     </py:for>
 </EXPERIMENT_SET>
diff --git a/ena_upload/templates/ENA_template_runs.xml b/ena_upload/templates/ENA_template_runs.xml
@@ -2,6 +2,9 @@
 <?python
 import pandas as pd
 import sys
+def attributetest(row, column):  # True only when `row` holds a usable value under `column`
+    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace():  # present, not NA, not whitespace-only
+        return True  # any other case falls through and implicitly returns None (falsy)
 def mandatorytest(row, column, index):
     if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace():
         return True 
@@ -20,12 +23,38 @@ def mandatorytest(row, column, index):
             <FILES>
                 <py:for each="index, row in file_groups.get_group(alias).iterrows()">
                 <py:if test="mandatorytest(row, 'file_type', index)">
-                <xi:include href="ENA_template_FILE.xml" />
+                <py:choose xmlns:py="http://genshi.edgewall.org/" test="">
+                <py:when test="row.file_type.lower().strip() == 'fastq'">
+                    <FILE filename="${row.file_name}" filetype="fastq" checksum_method="MD5" checksum="${row.file_checksum}">
+                        <py:if test="attributetest(row, 'read_label')">
+                        <py:for each="rlabel in row.read_label.split(',')">
+                        <READ_LABEL>${rlabel.strip()}</READ_LABEL>
+                        </py:for>
+                        </py:if>
+                        <py:if test="attributetest(row, 'read_type')">
+                        <py:for each="rtype in row.read_type.split(',')">
+                        <xi:include href="ENA_template_READ_TYPE.xml" />
+                        </py:for>
+                        </py:if>
+                    </FILE>
+                    </py:when>
+                    <py:otherwise>
+                        <xi:include href="ENA_template_FILE.xml" />
+                    </py:otherwise>
+                </py:choose>
                 </py:if>
                 </py:for>
             </FILES>
         </DATA_BLOCK>
         <RUN_ATTRIBUTES>
+            <py:for each="header, tag in extra_attributes.items()">
+                <py:if test="attributetest(row, header)">
+                <RUN_ATTRIBUTE>
+                    <TAG>${tag}</TAG>
+                    <VALUE>${row[header]}</VALUE>
+                </RUN_ATTRIBUTE>
+                </py:if>
+            </py:for>
             <RUN_ATTRIBUTE>
                 <TAG>SUBMISSION_TOOL</TAG>
                 <VALUE>${tool_name}</VALUE>

diff --git a/ena_upload/templates/ENA_template_samples_ERC000011.xml b/ena_upload/templates/ENA_template_samples_ERC000011.xml
@@ -87,12 +87,6 @@ def mandatorytest(row, column, index):
                 <VALUE>${row['collection date']}</VALUE>
             </SAMPLE_ATTRIBUTE>
             </py:if>
-            <py:if test="mandatorytest(row, 'geographic location (country and/or sea)', index)">
-            <SAMPLE_ATTRIBUTE>
-                <TAG>geographic location (country and/or sea)</TAG>
-                <VALUE>${row['geographic location (country and/or sea)']}</VALUE>
-            </SAMPLE_ATTRIBUTE>
-            </py:if>
             <py:if test="attributetest(row, 'geographic location (region and locality)')">
             <SAMPLE_ATTRIBUTE>
                 <TAG>geographic location (region and locality)</TAG>
@@ -123,6 +117,12 @@ def mandatorytest(row, column, index):
                 <VALUE>${row['sex']}</VALUE>
             </SAMPLE_ATTRIBUTE>
             </py:if>
+            <py:if test="mandatorytest(row, 'geographic location (country and/or sea)', index)">
+            <SAMPLE_ATTRIBUTE>
+                <TAG>geographic location (country and/or sea)</TAG>
+                <VALUE>${row['geographic location (country and/or sea)']}</VALUE>
+            </SAMPLE_ATTRIBUTE>
+            </py:if>
             <py:if test="attributetest(row, 'lab_host')">
             <SAMPLE_ATTRIBUTE>
                 <TAG>lab_host</TAG>
@@ -213,6 +213,14 @@ def mandatorytest(row, column, index):
                 <VALUE>${row['strain']}</VALUE>
             </SAMPLE_ATTRIBUTE>
             </py:if>
+            <py:for each="header, tag in extra_attributes.items()">
+            <py:if test="attributetest(row, header)">
+                <SAMPLE_ATTRIBUTE>
+                    <TAG>${tag}</TAG>
+                    <VALUE>${row[header]}</VALUE>
+                </SAMPLE_ATTRIBUTE>
+            </py:if>
+            </py:for>
             <SAMPLE_ATTRIBUTE>
                 <TAG>SUBMISSION_TOOL</TAG>
                 <VALUE>${tool_name}</VALUE>

diff --git a/ena_upload/templates/ENA_template_samples_ERC000012.xml b/ena_upload/templates/ENA_template_samples_ERC000012.xml
@@ -179,12 +179,6 @@ def mandatorytest(row, column, index):
                 <UNITS>m</UNITS>
             </SAMPLE_ATTRIBUTE>
             </py:if>
-            <py:if test="mandatorytest(row, 'geographic location (country and/or sea)', index)">
-            <SAMPLE_ATTRIBUTE>
-                <TAG>geographic location (country and/or sea)</TAG>
-                <VALUE>${row['geographic location (country and/or sea)']}</VALUE>
-            </SAMPLE_ATTRIBUTE>
-            </py:if>
             <py:if test="mandatorytest(row, 'geographic location (latitude)', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>geographic location (latitude)</TAG>
@@ -262,6 +256,12 @@ def mandatorytest(row, column, index):
                 <VALUE>${row['sample storage duration']}</VALUE>
             </SAMPLE_ATTRIBUTE>
             </py:if>
+            <py:if test="mandatorytest(row, 'geographic location (country and/or sea)', index)">
+            <SAMPLE_ATTRIBUTE>
+                <TAG>geographic location (country and/or sea)</TAG>
+                <VALUE>${row['geographic location (country and/or sea)']}</VALUE>
+            </SAMPLE_ATTRIBUTE>
+            </py:if>
             <py:if test="attributetest(row, 'host disease status')">
             <SAMPLE_ATTRIBUTE>
                 <TAG>host disease status</TAG>
@@ -516,6 +516,14 @@ def mandatorytest(row, column, index):
                 <VALUE>${row['chemical administration']}</VALUE>
             </SAMPLE_ATTRIBUTE>
             </py:if>
+            <py:for each="header, tag in extra_attributes.items()">
+            <py:if test="attributetest(row, header)">
+                <SAMPLE_ATTRIBUTE>
+                    <TAG>${tag}</TAG>
+                    <VALUE>${row[header]}</VALUE>
+                </SAMPLE_ATTRIBUTE>
+            </py:if>
+            </py:for>
             <SAMPLE_ATTRIBUTE>
                 <TAG>SUBMISSION_TOOL</TAG>
                 <VALUE>${tool_name}</VALUE>