Choose the top priority detect format for all directory depths #839

vinnamkim · 2023-03-08T03:46:33Z

Summary

Ticket no. 105527

Datumaro's current priority logic for format detection only compares candidates found at the same directory depth. Please see

datumaro/datumaro/components/format_detection.py

Lines 455 to 538 in 6b613c7

    
           def detect_dataset_format(
               formats: Iterable[Tuple[str, FormatDetector]],
               path: str,
               *,
               rejection_callback: Optional[RejectionCallback] = None,
           ) -> Sequence[str]:
               """
               Identifies the format(s) of the dataset located at the given path.

               Every supplied detector is applied to the path. A format is rejected
               when its detector fails, or when another detector succeeds with a
               higher confidence; only the formats sharing the highest confidence
               are kept. Finally, a name is dropped from the result whenever the
               result also contains that name suffixed with `_with_subset_dirs`.

               Args:
                   `formats`: The candidate formats, as an iterable of
                       (format name, `FormatDetector`) tuples.

                   `path`: The filesystem path to the dataset to be analyzed.

                   `rejection_callback`: Unless `None`, invoked with the format
                       name, the rejection reason and a human-readable message
                       when a detector rejects the path or when a format is
                       outranked by a more confident one.

               Returns: a sequence of detected format names.

               Raises:
                   FileNotFoundError: if `path` does not exist.
               """
               if not osp.exists(path):
                   raise FileNotFoundError(f"Path {path} doesn't exist")

               def _notify_outranked(loser: str, winner: str) -> None:
                   # Nothing to report when the caller did not ask for rejections.
                   if not rejection_callback:
                       return
                   message = f"Another format ({winner}) was matched with more confidence"
                   rejection_callback(loser, RejectionReason.insufficient_confidence, message)

               best_confidence = 0
               matches = []

               for format_name, detector in formats:
                   log.debug("Checking '%s' format...", format_name)

                   try:
                       confidence = apply_format_detector(path, detector)
                   except _FormatRejected as ex:
                       rejection_text = str(ex)
                       if rejection_callback:
                           rejection_callback(format_name, ex.reason, rejection_text)
                       log.debug(rejection_text)
                       continue

                   log.debug("Format matched with confidence %d", confidence)

                   # Only the matches tied at the highest confidence survive.
                   if confidence > best_confidence:
                       # The newcomer outranks everything collected so far.
                       for outranked in matches:
                           _notify_outranked(outranked, format_name)
                       matches = [format_name]
                       best_confidence = confidence
                   elif confidence == best_confidence:
                       matches.append(format_name)
                   else:
                       # The newcomer is weaker than the current leaders.
                       _notify_outranked(format_name, matches[0])

               # TODO: This should be controlled by our priority logic.
               # However, some datasets' detect() are currently broken,
               # so that it is inevitable to introduce this.
               # We must revisit this after fixing detect().
               #
               # Remove one shadowed name per pass (always the first one found) and
               # rescan, until no name has its "_with_subset_dirs" variant present.
               while True:
                   shadowed_idx = next(
                       (
                           idx
                           for idx, name in enumerate(matches)
                           if name + "_with_subset_dirs" in matches
                       ),
                       None,
                   )
                   if shadowed_idx is None:
                       break
                   del matches[shadowed_idx]

               return matches

We should expand this to consider the priorities coming from all directory depths and choose the top-priority one. This should be done in

datumaro/datumaro/components/environment.py

Lines 247 to 275 in 6b613c7

    
           def detect_dataset(
               self,
               path: str,
               depth: int = 1,
               rejection_callback: Optional[Callable[[str, RejectionReason, str], None]] = None,
           ) -> List[str]:
               """
               Detects the dataset format(s) at `path`, descending through
               single-entry directories.

               Detection runs on `path` with every registered importer's `detect`.
               If exactly one format is detected at a level, it is returned
               immediately. Otherwise the detected formats are accumulated and,
               when the directory listing yields exactly one entry that is a
               directory not named in the ignore list, detection is repeated on
               that entry. At most `depth + 1` levels are examined.

               Args:
                   `path`: The filesystem path to the dataset to be analyzed.

                   `depth`: How many levels below `path` may additionally be
                       examined.

                   `rejection_callback`: Forwarded unchanged to
                       `detect_dataset_format` for every examined level.

               Returns: a list of detected format names. When it is built from
                   several levels, it comes from a set, so its order is unspecified.
               """
               # NOTE(review): "__MSOSX" looks like a misspelling of "__MACOSX";
               # both entries are kept to preserve the existing behavior.
               ignore_dirs = {"__MSOSX", "__MACOSX"}
               matched_formats = set()

               for _ in range(depth + 1):
                   detected_formats = detect_dataset_format(
                       (
                           (format_name, importer.detect)
                           for format_name, importer in self.importers.items.items()
                       ),
                       path,
                       rejection_callback=rejection_callback,
                   )

                   # An unambiguous answer at this level ends the search.
                   if len(detected_formats) == 1:
                       return list(detected_formats)
                   matched_formats.update(detected_formats)

                   # `path` is a literal location, not a pattern: escape it so that
                   # directory names containing glob metacharacters ("[", "]", "*",
                   # "?") are listed correctly instead of being interpreted.
                   paths = glob.glob(osp.join(glob.escape(path), "*"))

                   # Descend only when there is exactly one candidate entry;
                   # the empty string makes the isdir() check below fail.
                   path = "" if len(paths) != 1 else paths[0]
                   if not osp.isdir(path) or osp.basename(path) in ignore_dirs:
                       break

               return list(matched_formats)

How to test

This change will be covered by the existing tests.

Checklist

I submit my changes into the develop branch
I have added description of my changes into CHANGELOG
I have updated the documentation accordingly
I have added tests to cover my changes
I have linked related issues

License

I submit my code changes under the same MIT License that covers the project.
Feel free to contact the maintainers if that's a concern.
I have updated the license header for each file (see an example below)

# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>

codecov-commenter · 2023-03-08T03:59:31Z

Codecov Report

Patch coverage: 100.00% and project coverage change: +0.01 🎉

Comparison is base (a44fc79) 78.47% compared to head (c9d6332) 78.49%.

Additional details and impacted files

@@             Coverage Diff             @@
##           develop     #839      +/-   ##
===========================================
+ Coverage    78.47%   78.49%   +0.01%     
===========================================
  Files          196      196              
  Lines        24179    24189      +10     
  Branches      4906     4907       +1     
===========================================
+ Hits         18974    18986      +12     
+ Misses        4114     4113       -1     
+ Partials      1091     1090       -1

Flag	Coverage Δ
macos-11_Python-3.8	`77.82% <100.00%> (+0.01%)`	⬆️
ubuntu-20.04_Python-3.8	`78.48% <100.00%> (+0.01%)`	⬆️
windows-2019_Python-3.8	`78.42% <100.00%> (+0.01%)`	⬆️

Flags with carried forward coverage won't be shown. Click here to find out more.

Impacted Files	Coverage Δ
datumaro/components/environment.py	`90.86% <100.00%> (-0.05%)`	⬇️
datumaro/components/format_detection.py	`93.54% <100.00%> (+0.40%)`	⬆️

... and 1 file with indirect coverage changes

Help us with your feedback. Take ten seconds to tell us how you rate us. Have a feature suggestion? Share it here.

☔ View full report in Codecov by Sentry.
📢 Do you have feedback about the report comment? Let us know in this issue.

…p-priority-for-all-depths Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>

wonjuleee

It looks good to me!

…p-priority-for-all-depths

vinnamkim added 3 commits March 8, 2023 12:11

Make detect_dataset_format() return confidence too

01624c2

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>

Choose the highest prioritized one searched for all depths

6423621

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>

Update CHANGELOG.md and licenses

fb87b94

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>

vinnamkim added BUG Something isn't working data formats PR is related to dataset formats labels Mar 8, 2023

vinnamkim added this to the 1.1.0 milestone Mar 8, 2023

vinnamkim marked this pull request as ready for review March 8, 2023 03:51

vinnamkim requested review from wonjuleee, sooahleex, chuneuny-emily and bonhunko March 8, 2023 03:51

Merge remote-tracking branch 'upstream/develop' into bugfix/choose-to…

e45e80d

…p-priority-for-all-depths Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>

This was referenced Mar 8, 2023

Fix celeba and align_celeba detect function #837

Merged

Fix MVTec format detect function #843

Merged

wonjuleee previously approved these changes Mar 10, 2023

View reviewed changes

Merge branch 'develop' into bugfix/choose-top-priority-for-all-depths

dff9a23

vinnamkim dismissed wonjuleee’s stale review via dff9a23 March 10, 2023 01:33

Merge remote-tracking branch 'upstream/develop' into bugfix/choose-to…

c9d6332

…p-priority-for-all-depths

vinnamkim requested a review from wonjuleee March 10, 2023 03:45

wonjuleee approved these changes Mar 13, 2023

View reviewed changes

vinnamkim merged commit f1c467a into openvinotoolkit:develop Mar 13, 2023

vinnamkim deleted the bugfix/choose-top-priority-for-all-depths branch March 13, 2023 02:03

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Choose the top priority detect format for all directory depths #839

Choose the top priority detect format for all directory depths #839

vinnamkim commented Mar 8, 2023 •

edited

Loading

codecov-commenter commented Mar 8, 2023 •

edited

Loading

wonjuleee left a comment

	def detect_dataset_format(
	    formats: Iterable[Tuple[str, FormatDetector]],
	    path: str,
	    *,
	    rejection_callback: Optional[RejectionCallback] = None,
	) -> Sequence[str]:
	    """
	    Identifies the format(s) of the dataset located at the given path.

	    Every supplied detector is applied to the path. A format is rejected
	    when its detector fails, or when another detector succeeds with a
	    higher confidence; only the formats sharing the highest confidence
	    are kept. Finally, a name is dropped from the result whenever the
	    result also contains that name suffixed with `_with_subset_dirs`.

	    Args:
	        `formats`: The candidate formats, as an iterable of
	            (format name, `FormatDetector`) tuples.

	        `path`: The filesystem path to the dataset to be analyzed.

	        `rejection_callback`: Unless `None`, invoked with the format
	            name, the rejection reason and a human-readable message
	            when a detector rejects the path or when a format is
	            outranked by a more confident one.

	    Returns: a sequence of detected format names.

	    Raises:
	        FileNotFoundError: if `path` does not exist.
	    """
	    if not osp.exists(path):
	        raise FileNotFoundError(f"Path {path} doesn't exist")

	    def _notify_outranked(loser: str, winner: str) -> None:
	        # Nothing to report when the caller did not ask for rejections.
	        if not rejection_callback:
	            return
	        message = f"Another format ({winner}) was matched with more confidence"
	        rejection_callback(loser, RejectionReason.insufficient_confidence, message)

	    best_confidence = 0
	    matches = []

	    for format_name, detector in formats:
	        log.debug("Checking '%s' format...", format_name)

	        try:
	            confidence = apply_format_detector(path, detector)
	        except _FormatRejected as ex:
	            rejection_text = str(ex)
	            if rejection_callback:
	                rejection_callback(format_name, ex.reason, rejection_text)
	            log.debug(rejection_text)
	            continue

	        log.debug("Format matched with confidence %d", confidence)

	        # Only the matches tied at the highest confidence survive.
	        if confidence > best_confidence:
	            # The newcomer outranks everything collected so far.
	            for outranked in matches:
	                _notify_outranked(outranked, format_name)
	            matches = [format_name]
	            best_confidence = confidence
	        elif confidence == best_confidence:
	            matches.append(format_name)
	        else:
	            # The newcomer is weaker than the current leaders.
	            _notify_outranked(format_name, matches[0])

	    # TODO: This should be controlled by our priority logic.
	    # However, some datasets' detect() are currently broken,
	    # so that it is inevitable to introduce this.
	    # We must revisit this after fixing detect().
	    #
	    # Remove one shadowed name per pass (always the first one found) and
	    # rescan, until no name has its "_with_subset_dirs" variant present.
	    while True:
	        shadowed_idx = next(
	            (
	                idx
	                for idx, name in enumerate(matches)
	                if name + "_with_subset_dirs" in matches
	            ),
	            None,
	        )
	        if shadowed_idx is None:
	            break
	        del matches[shadowed_idx]

	    return matches

	def detect_dataset(
	    self,
	    path: str,
	    depth: int = 1,
	    rejection_callback: Optional[Callable[[str, RejectionReason, str], None]] = None,
	) -> List[str]:
	    """
	    Detects the dataset format(s) at `path`, descending through
	    single-entry directories.

	    Detection runs on `path` with every registered importer's `detect`.
	    If exactly one format is detected at a level, it is returned
	    immediately. Otherwise the detected formats are accumulated and,
	    when the directory listing yields exactly one entry that is a
	    directory not named in the ignore list, detection is repeated on
	    that entry. At most `depth + 1` levels are examined.

	    Args:
	        `path`: The filesystem path to the dataset to be analyzed.

	        `depth`: How many levels below `path` may additionally be
	            examined.

	        `rejection_callback`: Forwarded unchanged to
	            `detect_dataset_format` for every examined level.

	    Returns: a list of detected format names. When it is built from
	        several levels, it comes from a set, so its order is unspecified.
	    """
	    # NOTE(review): "__MSOSX" looks like a misspelling of "__MACOSX";
	    # both entries are kept to preserve the existing behavior.
	    ignore_dirs = {"__MSOSX", "__MACOSX"}
	    matched_formats = set()

	    for _ in range(depth + 1):
	        detected_formats = detect_dataset_format(
	            (
	                (format_name, importer.detect)
	                for format_name, importer in self.importers.items.items()
	            ),
	            path,
	            rejection_callback=rejection_callback,
	        )

	        # An unambiguous answer at this level ends the search.
	        if len(detected_formats) == 1:
	            return list(detected_formats)
	        matched_formats |= set(detected_formats)

	        # `path` is a literal location, not a pattern: escape it so that
	        # directory names containing glob metacharacters ("[", "]", "*",
	        # "?") are listed correctly instead of being interpreted.
	        paths = glob.glob(osp.join(glob.escape(path), "*"))

	        # Descend only when there is exactly one candidate entry;
	        # the empty string makes the isdir() check below fail.
	        path = "" if len(paths) != 1 else paths[0]
	        if not osp.isdir(path) or osp.basename(path) in ignore_dirs:
	            break

	    return list(matched_formats)

Choose the top priority detect format for all directory depths #839

Choose the top priority detect format for all directory depths #839

Conversation

vinnamkim commented Mar 8, 2023 • edited Loading

Summary

How to test

Checklist

License

codecov-commenter commented Mar 8, 2023 • edited Loading

Codecov Report

wonjuleee left a comment

Choose a reason for hiding this comment

vinnamkim commented Mar 8, 2023 •

edited

Loading

codecov-commenter commented Mar 8, 2023 •

edited

Loading