fix: start work matching xlsform translation columns

hotosm · Sep 20, 2024 · ed7d1bd · ed7d1bd
1 parent 5d04c1a
commit ed7d1bd
Show file tree

Hide file tree

Showing 3 changed files with 86 additions and 16 deletions.
diff --git a/osm_fieldwork/update_xlsform.py b/osm_fieldwork/update_xlsform.py
@@ -18,30 +18,20 @@
 SURVEY_GROUP_NAME = "survey_questions"
 
 
-def filter_df_empty_rows(df: pd.DataFrame, column: str = NAME_COLUMN):
-    """Remove rows with None values in the specified column.
-
-    NOTE We retain 'end group' and 'end group' rows even if they have no name.
-    NOTE A generic df.dropna(how="all") would not catch accidental spaces etc.
-    """
-    if column in df.columns:
-        # Only retain 'begin group' and 'end group' if 'type' column exists
-        if "type" in df.columns:
-            return df[(df[column].notna()) | (df["type"].isin(["begin group", "end group", "begin_group", "end_group"]))]
-        else:
-            return df[df[column].notna()]
-    return df
-
-
 def merge_dataframes(mandatory_df: pd.DataFrame, user_question_df: pd.DataFrame, digitisation_df: pd.DataFrame):
     """Merge multiple Pandas dataframes together, removing duplicate fields."""
     # Remove empty rows from dataframes
     mandatory_df = filter_df_empty_rows(mandatory_df)
     user_question_df = filter_df_empty_rows(user_question_df)
     digitisation_df = filter_df_empty_rows(digitisation_df)
 
+    # Handle matching translation fields for label, hint, required_message, etc.
+    # FIXME this isn't working properly yet
+    # mandatory_df, user_question_df, digitisation_df = handle_translations(
+    #     mandatory_df, user_question_df, digitisation_df, fields=["label", "hint", "required_message"]
+    # )
+
     # Find common fields between user_question_df and mandatory_df or digitisation_df
-    # We use this to remove duplicates from the survey, giving our fields priority
     duplicate_fields = set(user_question_df[NAME_COLUMN]).intersection(
         set(mandatory_df[NAME_COLUMN]).union(set(digitisation_df[NAME_COLUMN]))
     )
@@ -84,6 +74,55 @@ def merge_dataframes(mandatory_df: pd.DataFrame, user_question_df: pd.DataFrame,
     )
 
 
+def handle_translations(
+    mandatory_df: pd.DataFrame, user_question_df: pd.DataFrame, digitisation_df: pd.DataFrame, fields: list[str]
+):
+    """Handle translations, defaulting to English if no translations are present.
+
+    Handles all field types that can be translated, such as
+    'label', 'hint', 'required_message'.
+    """
+    for field in fields:
+        # Identify translation columns for this field in the user_question_df
+        translation_columns = [col for col in user_question_df.columns if col.startswith(f"{field}::")]
+
+        if field in user_question_df.columns and not translation_columns:
+            # If user_question_df has only the base field (e.g., 'label'), map English translation from mandatory and digitisation
+            mandatory_df[field] = mandatory_df.get(f"{field}::English(en)", mandatory_df.get(field))
+            digitisation_df[field] = digitisation_df.get(f"{field}::English(en)", digitisation_df.get(field))
+
+            # Then drop translation columns
+            mandatory_df = mandatory_df.loc[:, ~mandatory_df.columns.str.startswith("label::")]
+            digitisation_df = digitisation_df.loc[:, ~digitisation_df.columns.str.startswith("label::")]
+
+        else:
+            # If translation columns exist, match them for mandatory and digitisation dataframes
+            for col in translation_columns:
+                mandatory_col = mandatory_df.get(col)
+                digitisation_col = digitisation_df.get(col)
+                if mandatory_col is not None:
+                    mandatory_df[col] = mandatory_col
+                if digitisation_col is not None:
+                    digitisation_df[col] = digitisation_col
+
+    return mandatory_df, user_question_df, digitisation_df
+
+
+def filter_df_empty_rows(df: pd.DataFrame, column: str = NAME_COLUMN):
+    """Remove rows with None values in the specified column.
+
+    NOTE We retain 'end group' and 'end group' rows even if they have no name.
+    NOTE A generic df.dropna(how="all") would not catch accidental spaces etc.
+    """
+    if column in df.columns:
+        # Only retain 'begin group' and 'end group' if 'type' column exists
+        if "type" in df.columns:
+            return df[(df[column].notna()) | (df["type"].isin(["begin group", "end group", "begin_group", "end_group"]))]
+        else:
+            return df[df[column].notna()]
+    return df
+
+
 def create_survey_group(name: str) -> dict[str, pd.DataFrame]:
     """Helper function to create a begin and end group for XLSForm."""
     begin_group = pd.DataFrame(

diff --git a/tests/test_update_xlsform.py b/tests/test_update_xlsform.py
@@ -48,6 +48,12 @@ async def test_merge_mandatory_fields():
     # Check it's still a valid xlsform by converting to XML
     xform_convert(updated_form)
 
+    # Check if translations were matched correctly
+    # FIXME enable once code fixed
+    # translation_found, label_field_found = check_translation_fields(workbook)
+    # assert not translation_found, "Translation fields should have been removed during merge."
+    # assert label_field_found, "The 'label' field should be present after merge."
+
 
 async def test_add_extra_select_from_file():
     """Append extra select_one_from_file questions based on Entity list names."""
@@ -94,6 +100,11 @@ async def test_buildings_xlsform():
     # Check it's still a valid xlsform by converting to XML
     xform_convert(updated_form)
 
+    workbook = load_workbook(filename=BytesIO(updated_form.getvalue()))
+    translation_found, label_field_found = check_translation_fields(workbook)
+    assert translation_found, "'label::English(en)' field not found in the survey sheet."
+    assert not label_field_found, "'label' field should not be present after merging translations."
+
 
 async def test_healthcare_xlsform():
     """Merge and test if buildings form is a valid XLSForm."""
@@ -152,6 +163,26 @@ def check_form_title(workbook: Workbook) -> None:
     assert form_title_value == "building", "form_title field is not set to 'building'"
 
 
+def check_translation_fields(workbook: Workbook):
+    """Check if translation fields were correctly matched."""
+    survey_sheet = workbook["survey"]
+    translation_found = False
+    label_field_found = False
+
+    # Iterate through the survey sheet columns and rows
+    for row in survey_sheet.iter_rows(min_row=1, max_col=survey_sheet.max_column):
+        for cell in row:
+            # Check if the English translation label exists
+            if cell.value == "label::English(en)":
+                translation_found = True
+
+            # Ensure that the base 'label' field is no longer present
+            if cell.value == "label":
+                label_field_found = True
+
+    return translation_found, label_field_found
+
+
 def get_sheet(workbook: Workbook, sheet_name: str) -> worksheet.worksheet.Worksheet:
     """Helper function to get a sheet or raise an error."""
     if sheet_name not in workbook.sheetnames:

diff --git a/tests/testdata/test_form_for_mandatory_fields.xls b/tests/testdata/test_form_for_mandatory_fields.xls