Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

i16 Process subsets #862

Merged
merged 10 commits into from
Apr 24, 2024
99 changes: 84 additions & 15 deletions app/services/ams/missing_instantiations_locator.rb
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
# frozen_string_literal: true
require 'ruby-progressbar'
require 'parallel'

module AMS
# @see https://github.com/scientist-softserv/ams/issues/16
class MissingInstantiationsLocator
class MissingInstantiationsLocator # rubocop:disable Metrics/ClassLength
WORKING_DIR = Rails.root.join('tmp', 'imports')

attr_reader :search_dirs, :current_dir, :truncated_dir_name, :results_path, :results, :progressbar, :logger
attr_reader :current_dir, :truncated_dir_name, :results_path, :results, :progressbar, :logger

# @param [Array<String>] search_dirs
# Sets up the locator's logger, which writes to
# i16-missing-instantiations-locator.log inside WORKING_DIR.
#
# NOTE(review): this span is taken from a rendered diff, so two versions of the
# constructor appear back to back: one taking `search_dirs` (which also resolves
# each name against WORKING_DIR) and a zero-argument one. They share the single
# `end` below. The zero-argument form appears to be the post-change version —
# confirm against the merged file.
def initialize(search_dirs)
@search_dirs = search_dirs.map { |dir| WORKING_DIR.join(dir) }
@logger = ActiveSupport::Logger.new(
WORKING_DIR.join('i16-missing-instantiations-locator.log')
)
def initialize
@logger = Logger.new(WORKING_DIR.join('i16-missing-instantiations-locator.log'))
end

def map_all_instantiation_identifiers
# @param [Array<String>] search_dirs
def map_all_instantiation_identifiers(search_dirnames)
search_dirs = search_dirnames.map { |dir| WORKING_DIR.join(dir) }
search_dirs.each do |current_dir|
@current_dir = current_dir
@truncated_dir_name = File.basename(current_dir)
Expand Down Expand Up @@ -57,27 +56,97 @@ def merge_all_instantiation_maps
end
end

# Splits the unique asset paths found in i16-combined-results.json into slices
# of 10,000 and copies each asset's XML file into a per-slice directory named
# i16-subset-<index> under WORKING_DIR.
#
# NOTE(review): this span is taken from a rendered diff, so several statements
# appear twice in a row (both `def` headers, both slice computations, both loop
# headers, both `pb_format` assignments, and the unguarded copy followed by the
# guarded one). The second of each pair appears to be the post-change version —
# confirm against the merged file.
def create_subsets_from_merged_map
# @param [Integer] num_processes number of worker processes handed to Parallel (in_processes:)
def create_subsets_from_merged_map(num_processes: 4)
results = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json')))
uniq_assset_paths = results.values.flatten.uniq
subsets = uniq_assset_paths.each_slice(10_000).to_a
uniq_asset_paths = results.values.flatten.uniq
subsets = uniq_asset_paths.each_slice(10_000).to_a

subsets.each_with_index do |set, i|
Parallel.each_with_index(subsets, in_processes: num_processes) do |set, i|
# The slice index becomes the suffix of the subset directory name.
set_path = WORKING_DIR.join("i16-subset-#{i}")
FileUtils.mkdir_p(set_path)
pb_format = "Copying XML files to #{File.basename(set_path)}: %c/%C %P%"
# Presumably %a / %e add elapsed time and a time estimate to the bar —
# confirm against the ruby-progressbar format flag documentation.
pb_format = "Copying XML files to #{File.basename(set_path)}: %a %e %c/%C %P%"
progressbar = ProgressBar.create(total: set.size, format: pb_format)

set.each do |asset_path|
# Each entry is split on '/' into a source directory name and an asset id.
importer_dir, asset_id = asset_path.split('/')
# The XML file is named after the asset id without its 'cpb-aacip-' prefix.
xml_filename = "#{asset_id.sub('cpb-aacip-', '')}.xml"

FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename))
# A file already present in the subset directory is not copied again; only a debug line is logged.
if File.exist?(WORKING_DIR.join(set_path, xml_filename))
logger.debug "#{xml_filename} already exists in #{File.basename(set_path)}"
else
begin
FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename))
# A failed copy is logged and the loop moves on to the next asset.
rescue => e
logger.error "#{e.class} - (#{File.basename(set_path)}/#{xml_filename}) - #{e.message}"
end
end
# The bar advances for every asset path, whether it was copied, skipped, or failed.
progressbar.increment
end
end
end

# Reports XML filenames that are referenced from more than one directory.
#
# Reads i16-combined-results.json from WORKING_DIR, takes every unique
# "<directory>/<asset id>" entry, and groups the directory names by the XML
# filename derived from the asset id (the id without its 'cpb-aacip-' prefix,
# plus '.xml'). Filenames seen under a single directory are dropped. For the
# rest, every copy is read and :content_differs records whether the copies are
# not all identical. The result is written as pretty-printed JSON to
# i16-duplicate-xml-files-audit.json in WORKING_DIR.
#
# @return [nil]
def audit_duplicate_xml_files
  combined = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json')))

  dirs_by_filename = combined.values.flatten.uniq.each_with_object({}) do |asset_path, map|
    dir_name, asset_id = asset_path.split('/')
    xml_name = "#{asset_id.sub('cpb-aacip-', '')}.xml"

    entry = (map[xml_name] ||= {})
    (entry[:paths] ||= []) << dir_name
  end

  duplicates = dirs_by_filename.select { |_xml_name, attrs| attrs[:paths].size > 1 }

  duplicates.each do |xml_name, attrs|
    distinct_contents = attrs[:paths].map { |dir_name| File.read(WORKING_DIR.join(dir_name, xml_name)) }.uniq
    attrs[:content_differs] = distinct_contents.size > 1
  end

  File.open(WORKING_DIR.join('i16-duplicate-xml-files-audit.json'), 'w') do |report|
    report.puts JSON.pretty_generate(duplicates)
  end
end

# Destroys the Assets whose XML files sit in the given subset directory.
#
# Every `*.xml` file directly inside +subset_path+ is mapped to an Asset id by
# prefixing its basename (without the `.xml` extension) with 'cpb-aacip-'. All
# ids are handed to AMS::AssetDestroyer in a single call. A StandardError raised
# while destroying is logged and not re-raised.
#
# @param subset_path [Pathname] subset directory; must respond to #join
# @return [Object] whatever AMS::AssetDestroyer#destroy returns, or the logger's
#   return value when destruction raised
def destroy_assets(subset_path)
  xml_files = Dir.glob(subset_path.join('*.xml'))
  # File.basename with a suffix strips only the trailing extension; a plain
  # sub('.xml', '') would remove the first '.xml' anywhere in the filename.
  asset_ids = xml_files.map { |xml_file| "cpb-aacip-#{File.basename(xml_file, '.xml')}" }

  begin
    AMS::AssetDestroyer.new(asset_ids: asset_ids, user_email: 'wgbh_admin@wgbh-mla.org').destroy
  rescue => e
    logger.error "Error destroying Assets. See asset_destroyer.log (#{e.class} - #{e.message})"
  end
end

# Creates one Bulkrax importer per `i16-subset*` entry found in WORKING_DIR.
#
# Each new importer is a `dup` of the existing importer named
# +base_importer_name+. It is renamed after the subset entry, keeps only an
# allow-listed set of parser fields, and has 'import_file_path' pointed at the
# subset path before being saved with `save!`.
#
# @param base_importer_name [String] name of the existing importer to copy
# @return [Array<String>] the subset paths that were found
# @raise [ArgumentError] if subsets exist but the base importer cannot be found
def create_subset_importers(base_importer_name: 'AMS1Importer_0-10000')
  subset_paths = Dir.glob(WORKING_DIR.join('i16-subset*'))
  return subset_paths if subset_paths.empty?

  desired_parser_field_attrs = %w[
    record_element
    import_type
    visibility
    rights_statement
    override_rights_statement
    file_style
    import_file_path
  ]

  # The base importer is the same for every subset, so it is looked up once.
  base_imp = Bulkrax::Importer.find_by(name: base_importer_name)
  # Fail with a clear message instead of a NoMethodError on nil further down.
  raise ArgumentError, "Bulkrax importer #{base_importer_name.inspect} not found" if base_imp.nil?

  subset_paths.each do |path|
    imp = base_imp.dup

    imp.name = File.basename(path)
    imp.parser_fields = base_imp.parser_fields.slice(*desired_parser_field_attrs)
    imp.parser_fields['import_file_path'] = path.to_s

    imp.save!
  end
end

private

def map_asset_id_to_inst_ids(xml_file)
Expand Down
3 changes: 2 additions & 1 deletion db/schema.test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2023_08_30_155065) do
ActiveRecord::Schema.define(version: 2024_03_07_053156) do

# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
Expand Down Expand Up @@ -414,6 +414,7 @@
t.datetime "updated_at", null: false
t.string "internal_resource"
t.integer "lock_version"
t.index "(((metadata -> 'bulkrax_identifier'::text) ->> 0))", name: "index_on_bulkrax_identifier", where: "((metadata -> 'bulkrax_identifier'::text) IS NOT NULL)"
t.index ["internal_resource"], name: "index_orm_resources_on_internal_resource"
t.index ["metadata"], name: "index_orm_resources_on_metadata", using: :gin
t.index ["metadata"], name: "index_orm_resources_on_metadata_jsonb_path_ops", opclass: :jsonb_path_ops, using: :gin
Expand Down
Loading