Skip to content

Commit

Permalink
fix: start work matching xlsform translation columns
Browse files Browse the repository at this point in the history
  • Loading branch information
spwoodcock committed Sep 20, 2024
1 parent 5d04c1a commit ed7d1bd
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 16 deletions.
71 changes: 55 additions & 16 deletions osm_fieldwork/update_xlsform.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,20 @@
SURVEY_GROUP_NAME = "survey_questions"


def filter_df_empty_rows(df: pd.DataFrame, column: str = NAME_COLUMN):
    """Drop rows whose value in ``column`` is missing.

    Rows whose 'type' marks a 'begin group' or 'end group' are kept even
    when they have no value in ``column``.

    NOTE A generic df.dropna(how="all") would not catch accidental spaces etc.

    Returns the dataframe unchanged if ``column`` is not one of its columns.
    """
    if column not in df.columns:
        return df

    has_value = df[column].notna()

    # Without a 'type' column there are no group markers to preserve
    if "type" not in df.columns:
        return df[has_value]

    group_markers = ["begin group", "end group", "begin_group", "end_group"]
    is_group_row = df["type"].isin(group_markers)
    return df[has_value | is_group_row]


def merge_dataframes(mandatory_df: pd.DataFrame, user_question_df: pd.DataFrame, digitisation_df: pd.DataFrame):
"""Merge multiple Pandas dataframes together, removing duplicate fields."""
# Remove empty rows from dataframes
mandatory_df = filter_df_empty_rows(mandatory_df)
user_question_df = filter_df_empty_rows(user_question_df)
digitisation_df = filter_df_empty_rows(digitisation_df)

# Handle matching translation fields for label, hint, required_message, etc.
# FIXME this isn't working properly yet
# mandatory_df, user_question_df, digitisation_df = handle_translations(
# mandatory_df, user_question_df, digitisation_df, fields=["label", "hint", "required_message"]
# )

# Find common fields between user_question_df and mandatory_df or digitisation_df
# We use this to remove duplicates from the survey, giving our fields priority
duplicate_fields = set(user_question_df[NAME_COLUMN]).intersection(
set(mandatory_df[NAME_COLUMN]).union(set(digitisation_df[NAME_COLUMN]))
)
Expand Down Expand Up @@ -84,6 +74,55 @@ def merge_dataframes(mandatory_df: pd.DataFrame, user_question_df: pd.DataFrame,
)


def _collapse_translations(df: pd.DataFrame, field: str) -> pd.DataFrame:
    """Return a new dataframe with '<field>::<language>' columns folded into '<field>'.

    The '<field>::English(en)' column, when present, becomes the value of the
    base '<field>' column. Every '<field>::*' column is then dropped. The input
    dataframe is left untouched.
    """
    prefix = f"{field}::"
    english_col = f"{field}::English(en)"

    # Only overwrite / create the base column when an English source exists;
    # otherwise keep whatever base column the dataframe already has (if any)
    # rather than fabricating an all-None column.
    if english_col in df.columns:
        df = df.assign(**{field: df[english_col]})

    # str() guards against non-string column labels (e.g. an empty dataframe)
    keep = [not str(col).startswith(prefix) for col in df.columns]
    return df.loc[:, keep]


def handle_translations(
    mandatory_df: pd.DataFrame, user_question_df: pd.DataFrame, digitisation_df: pd.DataFrame, fields: list[str]
):
    """Handle translations, defaulting to English if no translations are present.

    Handles all field types that can be translated, such as
    'label', 'hint', 'required_message'.

    For each entry in ``fields``:

    - If ``user_question_df`` has the plain column (e.g. 'label') and no
      translated variants (e.g. 'label::English(en)'), the mandatory and
      digitisation dataframes are reduced to the same shape: their
      '<field>::English(en)' values are copied into '<field>' and all
      '<field>::*' columns are removed.
    - Otherwise the mandatory and digitisation dataframes are left as they
      are for that field.

    Args:
        mandatory_df: Dataframe of mandatory questions.
        user_question_df: Dataframe of the user supplied questions; it is
            only inspected, never modified.
        digitisation_df: Dataframe of digitisation questions.
        fields: Base names of the translatable columns to process.

    Returns:
        Tuple of (mandatory_df, user_question_df, digitisation_df). The
        mandatory and digitisation entries are new dataframes whenever a
        field was collapsed; the dataframes passed in are not mutated.
    """
    for field in fields:
        prefix = f"{field}::"
        user_has_translations = any(str(col).startswith(prefix) for col in user_question_df.columns)

        if field in user_question_df.columns and not user_has_translations:
            # The prefix is derived from the current field, so 'hint::*' and
            # 'required_message::*' are dropped as well, not only 'label::*'.
            mandatory_df = _collapse_translations(mandatory_df, field)
            digitisation_df = _collapse_translations(digitisation_df, field)
        # else: nothing to do for this field. The previous implementation
        # re-assigned each shared translation column to itself here, which
        # had no effect on the data.

    return mandatory_df, user_question_df, digitisation_df


def filter_df_empty_rows(df: pd.DataFrame, column: str = NAME_COLUMN):
    """Drop rows whose value in ``column`` is missing.

    Rows whose 'type' marks a 'begin group' or 'end group' are kept even
    when they have no value in ``column``.

    NOTE A generic df.dropna(how="all") would not catch accidental spaces etc.

    Returns the dataframe unchanged if ``column`` is not one of its columns.
    """
    if column not in df.columns:
        return df

    has_value = df[column].notna()

    # Without a 'type' column there are no group markers to preserve
    if "type" not in df.columns:
        return df[has_value]

    group_markers = ["begin group", "end group", "begin_group", "end_group"]
    is_group_row = df["type"].isin(group_markers)
    return df[has_value | is_group_row]


def create_survey_group(name: str) -> dict[str, pd.DataFrame]:
"""Helper function to create a begin and end group for XLSForm."""
begin_group = pd.DataFrame(
Expand Down
31 changes: 31 additions & 0 deletions tests/test_update_xlsform.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ async def test_merge_mandatory_fields():
# Check it's still a valid xlsform by converting to XML
xform_convert(updated_form)

# Check if translations were matched correctly
# FIXME enable once code fixed
# translation_found, label_field_found = check_translation_fields(workbook)
# assert not translation_found, "Translation fields should have been removed during merge."
# assert label_field_found, "The 'label' field should be present after merge."


async def test_add_extra_select_from_file():
"""Append extra select_one_from_file questions based on Entity list names."""
Expand Down Expand Up @@ -94,6 +100,11 @@ async def test_buildings_xlsform():
# Check it's still a valid xlsform by converting to XML
xform_convert(updated_form)

workbook = load_workbook(filename=BytesIO(updated_form.getvalue()))
translation_found, label_field_found = check_translation_fields(workbook)
assert translation_found, "'label::English(en)' field not found in the survey sheet."
assert not label_field_found, "'label' field should not be present after merging translations."


async def test_healthcare_xlsform():
"""Merge and test if buildings form is a valid XLSForm."""
Expand Down Expand Up @@ -152,6 +163,26 @@ def check_form_title(workbook: Workbook) -> None:
assert form_title_value == "building", "form_title field is not set to 'building'"


def check_translation_fields(workbook: Workbook):
    """Report which label columns appear in the 'survey' sheet.

    Every cell of the sheet is inspected (from the first row, across all
    columns up to the sheet's max_column).

    Returns:
        Tuple of two booleans ``(translation_found, label_field_found)``:
        whether any cell equals 'label::English(en)', and whether any cell
        equals the plain 'label'.
    """
    sheet = workbook["survey"]

    # Flatten the sheet once, then run both membership checks against it
    cell_values = [
        cell.value
        for row in sheet.iter_rows(min_row=1, max_col=sheet.max_column)
        for cell in row
    ]

    translation_found = "label::English(en)" in cell_values
    label_field_found = "label" in cell_values
    return translation_found, label_field_found


def get_sheet(workbook: Workbook, sheet_name: str) -> worksheet.worksheet.Worksheet:
"""Helper function to get a sheet or raise an error."""
if sheet_name not in workbook.sheetnames:
Expand Down
Binary file modified tests/testdata/test_form_for_mandatory_fields.xls
Binary file not shown.

0 comments on commit ed7d1bd

Please sign in to comment.