diff --git a/setup.py b/setup.py index 27333f16..0cf78c66 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ 'path.py<12', # Pinned for Python 2 compatibility 'pyquery', 'pyxform', + 'orderedset', 'statistics', 'XlsxWriter', 'backports.csv', # Remove after dropping Python 2 support (and rewrite `imports`) @@ -41,6 +42,7 @@ 'path.py', 'pyquery', 'pyxform', + 'orderedset', 'statistics', 'XlsxWriter', 'backports.csv', # Remove after dropping Python 2 support (and rewrite `imports`) diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py index e1dfe825..b6d0f8fd 100644 --- a/src/formpack/utils/expand_content.py +++ b/src/formpack/utils/expand_content.py @@ -6,6 +6,7 @@ from __future__ import (unicode_literals, print_function, absolute_import, division) from copy import deepcopy +from orderedset import OrderedSet import re from .array_to_xpath import EXPANDABLE_FIELD_TYPES @@ -158,17 +159,7 @@ def _get_special_survey_cols(content): 'hint::English', For more examples, see tests. """ - uniq_cols_set = set() - uniq_cols = [] - """ - The reason for two separate data structures is performance. The goal is to have a unique - set that preserves insertion order. - - We implement that by using set() for uniqueness and list() for order. - - Python has OrderedDict that provides that functionality, but the performance is slightly - worse compared to this solution. - """ + uniq_cols = OrderedSet() special = OrderedDict() @@ -180,11 +171,7 @@ def _pluck_uniq_cols(sheet_name): # to be parsed and translated in a previous iteration _cols = [r for r in row.keys() if r not in known_translated_cols] - for _col in _cols: - if _col in uniq_cols_set: - continue - uniq_cols_set.add(_col) - uniq_cols.append(_col) + uniq_cols.update(_cols) def _mark_special(**kwargs): column_name = kwargs.pop('column_name') @@ -233,7 +220,7 @@ def _mark_special(**kwargs): translation=matched[1]) # also add the empty column if it exists - if column_shortname in uniq_cols_set: + if column_shortname in uniq_cols: _mark_special(column_name=column_shortname, column=column_shortname, translation=UNTRANSLATED)