Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of translated media #280

Merged
merged 10 commits into from
Dec 3, 2021
1 change: 1 addition & 0 deletions src/formpack/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
}

GEO_QUESTION_TYPES = ('geopoint', 'geotrace', 'geoshape')
MEDIA_TYPES = ('audio', 'image', 'video')

# Export Settings
EXPORT_SETTING_FIELDS = 'fields'
Expand Down
171 changes: 122 additions & 49 deletions src/formpack/utils/expand_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,38 @@
import re
from collections import OrderedDict
from copy import deepcopy
from typing import (
Any,
Dict,
List,
Optional,
Set,
Tuple,
Union,
)

from .array_to_xpath import EXPANDABLE_FIELD_TYPES
from .iterator import get_first_occurrence
from .replace_aliases import META_TYPES, selects
from ..constants import (UNTRANSLATED, OR_OTHER_COLUMN,
TAG_COLUMNS_AND_SEPARATORS)
from ..constants import (
MEDIA_TYPES,
OR_OTHER_COLUMN,
TAG_COLUMNS_AND_SEPARATORS,
UNTRANSLATED,
)

REMOVE_EMPTY_STRINGS = True
# this will be used to check which version of formpack was used to compile the
# asset content
SCHEMA_VERSION = "1"
SCHEMA_VERSION = '1'


def _expand_translatable_content(content, row, col_shortname,
special_column_details):
def _expand_translatable_content(
content: Dict[str, List[Any]],
row: Dict[str, Union[str, List[Any]]],
col_shortname: str,
special_column_details: Dict[str, Optional[str]],
) -> None:
_scd = special_column_details
if 'translation' in _scd:
translations = content['translations']
Expand All @@ -39,7 +56,10 @@ def _expand_translatable_content(content, row, col_shortname,
del row[col_shortname]


def _expand_tags(row, tag_cols_and_seps=None):
def _expand_tags(
row: Dict[str, Union[str, List[Any]]],
tag_cols_and_seps: Optional[Dict[str, str]] = None,
) -> Dict[str, Union[str, List[Any]]]:
if tag_cols_and_seps is None:
tag_cols_and_seps = {}
tags = []
Expand All @@ -55,13 +75,16 @@ def _expand_tags(row, tag_cols_and_seps=None):
tags_str = row.pop(tag_col, None)
if tags_str and isinstance(tags_str, str):
for tag in re.findall(r'([\#\+][a-zA-Z][a-zA-Z0-9_]*)', tags_str):
tags.append('hxl:%s' % tag)
if len(tags) > 0:
tags.append(f'hxl:{tag}')
if tags:
row['tags'] = tags
return row


def _get_translations_from_special_cols(special_cols, translations):
def _get_translations_from_special_cols(
special_cols: OrderedDict,
translations: List[str],
) -> Tuple[List[str], Set[str]]:
translated_cols = []
for colname, parsedvals in iter(special_cols.items()):
if 'translation' in parsedvals:
Expand All @@ -71,8 +94,8 @@ def _get_translations_from_special_cols(special_cols, translations):
return translations, set(translated_cols)


def expand_content_in_place(content):
(specials, translations, transl_cols) = _get_special_survey_cols(content)
def expand_content_in_place(content: Dict[str, List[Any]]) -> None:
specials, translations, transl_cols = _get_special_survey_cols(content)

if len(translations) > 0:
content['translations'] = translations
Expand All @@ -94,10 +117,15 @@ def expand_content_in_place(content):
# legacy {'select_one': 'xyz'} format might
# still be on kobo-prod
_type_str = _expand_type_to_dict(
get_first_occurrence(_type.keys()))['type']
get_first_occurrence(_type.keys())
)['type']
_list_name = get_first_occurrence(_type.values())
row.update({'type': _type_str,
'select_from_list_name': _list_name})
row.update(
{
'type': _type_str,
'select_from_list_name': _list_name,
}
)

_expand_tags(row, tag_cols_and_seps=TAG_COLUMNS_AND_SEPARATORS)

Expand All @@ -111,7 +139,7 @@ def expand_content_in_place(content):
if REMOVE_EMPTY_STRINGS:
row_copy = dict(row)
for key, val in row_copy.items():
if val == "":
if val == '':
del row[key]

# for now, prepend meta questions to the beginning of the survey
Expand All @@ -133,7 +161,10 @@ def expand_content_in_place(content):
content['schema'] = SCHEMA_VERSION


def expand_content(content, in_place=False):
def expand_content(
content: Dict[str, List[Any]],
in_place: bool = False,
) -> Optional[Dict[str, List[Any]]]:
if in_place:
expand_content_in_place(content)
return None
Expand All @@ -143,7 +174,29 @@ def expand_content(content, in_place=False):
return content_copy


def _get_special_survey_cols(content):
def _get_known_translated_cols(translated_cols: List[str]) -> List[str]:
"""
This is necessary to handle a legacy issue where media attributes such as
`image`, `audio` and `video` were transformed to `media::x`, but their
value in the `translated` list was still `x` therefore not being recognized
as a "known translated" column. This resulted in a mismatch in labels and
translations and broke the exports and autoreport.
"""
if not translated_cols:
return []

_translated_cols = []
for col in translated_cols:
if col in MEDIA_TYPES:
col = f'media::{col}'
_translated_cols.append(col)

return _translated_cols


def _get_special_survey_cols(
content: Dict[str, List[Any]],
) -> Tuple[OrderedDict, List[str], List[str]]:
"""
This will extract information about columns in an xlsform with ':'s

Expand All @@ -155,20 +208,24 @@ def _get_special_survey_cols(content):
'hint::English',
For more examples, see tests.
"""
RE_MEDIA_TYPES = '|'.join(MEDIA_TYPES)

uniq_cols = OrderedDict()
special = OrderedDict()

known_translated_cols = content.get('translated', [])
known_translated_cols = _get_known_translated_cols(
content.get('translated')
)

def _pluck_uniq_cols(sheet_name):
def _pluck_uniq_cols(sheet_name: str) -> None:
for row in content.get(sheet_name, []):
# we don't want to expand columns which are already known
# to be parsed and translated in a previous iteration
_cols = [r for r in row.keys() if r not in known_translated_cols]

uniq_cols.update(OrderedDict.fromkeys(_cols))

def _mark_special(**kwargs):
def _mark_special(**kwargs: str) -> None:
column_name = kwargs.pop('column_name')
special[column_name] = kwargs

Expand All @@ -177,57 +234,73 @@ def _mark_special(**kwargs):

for column_name in uniq_cols.keys():
if column_name in ['label', 'hint']:
_mark_special(column_name=column_name,
column=column_name,
translation=UNTRANSLATED)
if ':' not in column_name:
_mark_special(
column_name=column_name,
column=column_name,
translation=UNTRANSLATED,
)
if ':' not in column_name and column_name not in MEDIA_TYPES:
continue
if column_name.startswith('bind:'):
continue
if column_name.startswith('body:'):
continue
mtch = re.match(r'^media\s*::?\s*([^:]+)\s*::?\s*([^:]+)$', column_name)
mtch = re.match(
rf'^(media\s*::?\s*)?({RE_MEDIA_TYPES})\s*::?\s*([^:]+)$',
column_name,
)
if mtch:
matched = mtch.groups()
media_type = matched[0]
_mark_special(column_name=column_name,
column='media::{}'.format(media_type),
coltype='media',
media=media_type,
translation=matched[1])
media_type = matched[1]
translation = matched[2]
_mark_special(
column_name=column_name,
column='media::{}'.format(media_type),
coltype='media',
media=media_type,
translation=translation,
)
continue
mtch = re.match(r'^media\s*::?\s*([^:]+)$', column_name)
mtch = re.match(rf'^(media\s*::?\s*)?({RE_MEDIA_TYPES})$', column_name)
if mtch:
media_type = mtch.groups()[0]
_mark_special(column_name=column_name,
column='media::{}'.format(media_type),
coltype='media',
media=media_type,
translation=UNTRANSLATED)
matched = mtch.groups()
media_type = matched[1]
_mark_special(
column_name=column_name,
column='media::{}'.format(media_type),
coltype='media',
media=media_type,
translation=UNTRANSLATED,
)
continue
mtch = re.match(r'^([^:]+)\s*::?\s*([^:]+)$', column_name)
if mtch:
# example: label::x, constraint_message::x, hint::x
matched = mtch.groups()
column_shortname = matched[0]
_mark_special(column_name=column_name,
column=column_shortname,
translation=matched[1])
_mark_special(
column_name=column_name,
column=column_shortname,
translation=matched[1],
)

# also add the empty column if it exists
if column_shortname in uniq_cols:
_mark_special(column_name=column_shortname,
column=column_shortname,
translation=UNTRANSLATED)
_mark_special(
column_name=column_shortname,
column=column_shortname,
translation=UNTRANSLATED,
)
continue
(translations,
translated_cols) = _get_translations_from_special_cols(special,
content.get('translations', []))
translations, translated_cols = _get_translations_from_special_cols(
special,
content.get('translations', []),
)
translated_cols.update(known_translated_cols)
return special, translations, sorted(translated_cols)


def _expand_type_to_dict(type_str):
def _expand_type_to_dict(type_str: str) -> Dict[str, Union[str, bool]]:
SELECT_PATTERN = r'^({select_type})\s+(\S+)$'
out = {}
match = re.search('( or.other)$', type_str)
Expand Down Expand Up @@ -255,6 +328,6 @@ def _expand_type_to_dict(type_str):
return {'type': type_str}


def _expand_xpath_to_list(xpath_string):
def _expand_xpath_to_list(xpath_string: str) -> str:
# a placeholder for a future expansion
return xpath_string
40 changes: 40 additions & 0 deletions tests/test_expand_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,46 @@ def test_expand_translated_media():
}


def test_expand_translated_media_mangled_format():
    """
    Regression test for the bug seen in formpack#115: surveys that used
    image::lang rather than media::image::lang need their `translated`
    entry to be recognized as media::image.
    """
    content = {
        'survey': [{'type': 'note', 'media::image': ['eng.jpg']}],
        # Bug 🐛: not coming through as media::image
        'translated': ['image'],
        'schema': SCHEMA_VERSION,
        'translations': ['English (en)'],
    }

    expected_expanded = {
        'survey': [{'type': 'note', 'media::image': ['eng.jpg']}],
        'translated': ['media::image'],
        'schema': SCHEMA_VERSION,
        'translations': ['English (en)'],
    }
    expected_flattened = {
        'survey': [
            {'type': 'note', 'media::image::English (en)': 'eng.jpg'},
        ],
    }

    # expanding in place is expected to rewrite `image` to `media::image`
    expand_content(content, in_place=True)
    assert content == expected_expanded

    # flattening afterwards should yield the fully qualified media column
    flatten_content(content, in_place=True)
    assert content == expected_flattened


def test_expand_translated_media_with_no_translated():
s1 = {'survey': [{'type': 'note',
'media::image': 'nolang.jpg',
Expand Down