From e43449efa3e9a0e08adf5d353ae3afbd5546ef0c Mon Sep 17 00:00:00 2001
From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com>
Date: Wed, 17 Apr 2024 10:36:55 -0700
Subject: [PATCH] add method to audit duplicate XML files

Specifically, we're auditing whether their content differs from each
other or not. If not, we aren't concerned with handling the duplicates
further, but if the content does differ, we need to figure out how
---
 .../ams/missing_instantiations_locator.rb     | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb
index ceb8c938..4032ee81 100644
--- a/app/services/ams/missing_instantiations_locator.rb
+++ b/app/services/ams/missing_instantiations_locator.rb
@@ -82,6 +82,32 @@ def create_subsets_from_merged_map(num_processes: 4)
     end
   end
 
+  def audit_duplicate_xml_files
+    results = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json')))
+    asset_paths = results.values.flatten.uniq
+    filename_map = {}
+
+    asset_paths.each do |path|
+      path, asset_id = path.split('/')
+      filename = "#{asset_id.sub('cpb-aacip-', '')}.xml"
+
+      filename_map[filename] ||= {}
+      filename_map[filename][:paths] ||= []
+      filename_map[filename][:paths] << path
+    end
+
+    duplicate_files = filename_map.select { |_filename, attrs| attrs[:paths].size > 1 }
+
+    duplicate_files.each do |filename, attrs|
+      file_contents = attrs[:paths].map { |path| File.read(WORKING_DIR.join(path, filename)) }
+      duplicate_files[filename][:content_differs] = file_contents.uniq.size > 1
+    end
+
+    File.open(WORKING_DIR.join('i16-duplicate-xml-files-audit.json'), 'w') do |file|
+      file.puts JSON.pretty_generate(duplicate_files)
+    end
+  end
+
   private
 
   def map_asset_id_to_inst_ids(xml_file)