diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc index 7b163dd923a31..68b1aab919c3d 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc @@ -197,7 +197,6 @@ void InitBeamState(transformers::IBeamSearchState* beam_state, // T5 does not need position, so next_positions is empty for T5. if (!beam_state->next_positions.empty()) { - memset(beam_state->next_positions.data(), 0, beam_state->next_positions.size_bytes()); gsl::copy(sequence_lengths, beam_state->next_positions); } @@ -274,13 +273,13 @@ Status ProcessLogits(const OrtValue& logits, // // Get scores for candidates of next token: next_token_scores = log_softmax(next_token_logits, dim=-1) gsl::span& next_token_scores = beam_state->next_token_scores; ORT_RETURN_IF_ERROR( - SoftmaxCPU( - batch_beam_size, // rows - vocab_size, // elements per row - (input_length == 1 && logits_batch_size == batch_beam_size) ? logits_data : next_token_logits.data(), - next_token_scores.data(), - true, - thread_pool)); + SoftmaxCPU( + batch_beam_size, // rows + vocab_size, // elements per row + (input_length == 1 && logits_batch_size == batch_beam_size) ? logits_data : next_token_logits.data(), + next_token_scores.data(), + true, + thread_pool)); #ifdef DEBUG_BEAM_SEARCH dumper->Print("next_token_scores after softmax", next_token_scores.data(), batch_size, num_beams, vocab_size); @@ -428,12 +427,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper) { + int gpt_subgraph_first_present_output_idx) { // last_outputs: logits, present_0, present_1, ... 
// next_inputs: input_ids, position_id, attention_mask, past_0, past_1 ORT_UNUSED_PARAMETER(stream); @@ -454,10 +453,12 @@ Status UpdateGptFeeds( } next_inputs[0] = input_ids; - // Update position IDs - int32_t* position_data = position_ids.GetMutable()->MutableData(); - for (int i = 0; i < batch_beam_size; i++) { - position_data[i]++; + if (increase_position) { + // Update position IDs + int32_t* position_data = position_ids.GetMutable()->MutableData(); + for (int i = 0; i < batch_beam_size; i++) { + position_data[i]++; + } } next_inputs[1] = position_ids; @@ -477,14 +478,6 @@ Status UpdateGptFeeds( } next_inputs[2] = attention_mask; -#ifdef DEBUG_BEAM_SEARCH - dumper->Print("input_ids", input_ids); - dumper->Print("position_ids", position_ids); - dumper->Print("attention_mask", attention_mask); -#else - ORT_UNUSED_PARAMETER(dumper); -#endif - // Update past state if (num_beams == 1) { // feed present_* output to past_* inputs one by one @@ -725,12 +718,12 @@ template Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); template Status UpdateDecoderFeeds( AllocatorPtr allocator, @@ -751,28 +744,28 @@ template Status UpdateDecoderFeeds( template void ExpandInputs(const OrtValue& input, int num_beams, AllocatorPtr allocator, OrtValue& expanded); template Status ExpandBuffer( - void* stream, - const OrtValue& input, - int num_beams, - AllocatorPtr allocator, - OrtValue& expanded, - bool only_copy_shape); + void* stream, + const OrtValue& input, + int num_beams, + AllocatorPtr allocator, + OrtValue& expanded, + bool only_copy_shape); template Status ExpandBuffer( - void* stream, - const OrtValue& input, - int num_beams, - AllocatorPtr allocator, - OrtValue& expanded, - bool only_copy_shape); + void* stream, + const OrtValue& input, + int num_beams, + AllocatorPtr allocator, + OrtValue& expanded, + bool only_copy_shape); template Status ExpandBuffer( - void* stream, - const OrtValue& input, - int num_beams, - AllocatorPtr allocator, - OrtValue& expanded, - bool only_copy_shape); + void* stream, + const OrtValue& input, + int num_beams, + AllocatorPtr allocator, + OrtValue& expanded, + bool only_copy_shape); } // namespace BeamSearchCpuDeviceHelper } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h index ab18eec25cde0..36ab8d8e93a98 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h @@ -96,12 +96,12 @@ using UpdateGptFeedsFunc = std::function& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper)>; + int gpt_subgraph_first_present_output_idx)>; // Create encoder inputs (for encoder-decoder model like T5). 
 using CreateEncoderInputsFunc = std::function;

 }  // namespace BeamSearchDeviceHelper

-
 // These are CPU specific device helper implementations
 namespace BeamSearchCpuDeviceHelper {

 Status TopK(
@@ -208,12 +207,12 @@ Status UpdateGptFeeds(
     std::vector& next_inputs,
     int current_length,
     OrtValue& position_ids,
+    bool increase_position,
    gsl::span beam_next_tokens,
    gsl::span beam_indices,
    int num_beams,
    int gpt_subgraph_first_past_input_idx,
-    int gpt_subgraph_first_present_output_idx,
-    const transformers::IConsoleDumper* dumper);
+    int gpt_subgraph_first_present_output_idx);

 // ---------------------------------------------------------------
 // Functions for encoder-decoder model like T5
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
index 9cf5daeba929d..7674c2a781055 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
@@ -56,6 +56,7 @@ class BeamSearchGpt : public BeamSearchBase {
                      std::vector& next_inputs,
                      int current_length,
                      OrtValue& position_ids,
+                     bool increase_position,
                      gsl::span beam_next_tokens,
                      gsl::span beam_indices);

@@ -93,6 +94,7 @@ Status BeamSearchGpt::UpdateFeeds(
     std::vector& next_inputs,
     int current_length,
     OrtValue& position_ids,
+    bool increase_position,
     gsl::span beam_next_tokens,
     gsl::span beam_indices) {
   return update_feeds_func_(this->temp_space_allocator_,
@@ -101,12 +103,12 @@ Status BeamSearchGpt::UpdateFeeds(
                             next_inputs,
                             current_length,
                             position_ids,
+                            increase_position,
                             beam_next_tokens,
                             beam_indices,
                             this->parameters_->num_beams,
                             gpt_subgraph_.GetFirstPastInputIndex(),
-                            gpt_subgraph_.GetFirstPresentOutputIndex(),
-                            this->GetConsoleDumper());
+                            gpt_subgraph_.GetFirstPresentOutputIndex());
 }

 template
@@ -186,11 +188,7 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager& feeds_fetches_manage
 #ifdef DEBUG_BEAM_SEARCH
   const IConsoleDumper* dumper = this->GetConsoleDumper();
-  dumper->Print("input_ids", feeds[0]);
-  dumper->Print("position_ids", feeds[1]);
-  dumper->Print("attention_mask", feeds[2]);
 #endif
-
   // Position ids for all iterations except the first. It uses the memory buffer owned by next_positions.
   OrtValue position_ids;
   int64_t dims[] = {parameters->BatchBeamSize(), 1};
@@ -205,9 +203,19 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager& feeds_fetches_manage
   int iteration_counter = 0;
   while (current_length < parameters->max_length) {
     iteration_counter++;
+
 #ifdef DEBUG_BEAM_SEARCH
     auto cur_len = std::to_string(current_length);
     dumper->Print("***CurrentLength", cur_len, true);
+    dumper->Print("iteration", iteration_counter, true);
+
+    dumper->Print("input_ids", feeds[0]);
+    dumper->Print("position_ids", feeds[1]);
+    dumper->Print("attention_mask", feeds[2]);
+    for (size_t i = 3; i < feeds.size(); i++) {
+      dumper->Print("past", static_cast(i) - 3, true);
+      dumper->Print("", feeds[i]);
+    }
 #endif

     status = utils::ExecuteSubgraph(this->decoder_session_state_,
@@ -241,8 +249,11 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager& feeds_fetches_manage
     // Prepare inputs for next round of subgraph call.
     if (current_length < parameters->max_length) {
+      // For the first iteration, position_ids is initialized with the sequence lengths, so we can add it to feeds directly.
+      // For the remaining iterations, we need to increase position_ids first, then add it to feeds.
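To make the new flag concrete before the code continues: a minimal sketch of the update it gates, assuming int32_t position ids with one entry per beam (the real logic lives in the UpdateGptFeeds helpers patched above). It also shows why the memset removed from InitBeamState was redundant: the buffer is fully overwritten by the copied sequence lengths.

```cpp
#include <cstdint>
#include <vector>

// Iteration 1 (increase_position == false): the buffer already holds the
// per-beam sequence lengths copied in by InitBeamState, so feed it as-is.
// Iterations > 1: every beam has generated exactly one more token, so each
// position advances by one before the next subgraph call.
void AdvancePositions(std::vector<int32_t>& positions, bool increase_position) {
  if (!increase_position) {
    return;
  }
  for (auto& p : positions) {
    ++p;
  }
}
```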
+ bool increase_position = (iteration_counter > 1); ORT_RETURN_IF_ERROR(UpdateFeeds(fetches, feeds, current_length, - position_ids, + position_ids, increase_position, beam_next_tokens.as_span(), beam_indices.as_span())); } diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc index b712908259da1..780e98909c603 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc @@ -258,9 +258,8 @@ Status ProcessLogits(const OrtValue& logits, // // The output will be float for consideration of precision and easy integration with remaining parts. float* Y_data = next_token_scores.data(); - const CudaT* X_data = (input_length == 1 && logits_batch_size == batch_beam_size) ? - logits_data : - reinterpret_cast(next_token_logits.data()); + bool is_single_token = (input_length == 1 && logits_batch_size == batch_beam_size); + const CudaT* X_data = is_single_token ? logits_data : reinterpret_cast(next_token_logits.data()); dispatch_blockwise_softmax_forward( cuda_stream, Y_data, X_data, vocab_size, vocab_size, batch_size * num_beams); @@ -500,12 +499,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper) { + int gpt_subgraph_first_present_output_idx) { // Update input_ids with next tokens. int batch_beam_size = static_cast(beam_next_tokens.length()); int64_t dims[] = {batch_beam_size, 1}; @@ -519,7 +518,7 @@ Status UpdateGptFeeds( next_inputs[0] = input_ids; // Update position IDs - int32_t* position_data = position_ids.GetMutable()->MutableData(); + int32_t* position_data = increase_position ? 
position_ids.GetMutable()->MutableData() : nullptr; next_inputs[1] = position_ids; // Update attention mask @@ -538,14 +537,6 @@ Status UpdateGptFeeds( next_inputs[2] = attention_mask; -#ifdef DEBUG_BEAM_SEARCH - dumper->Print("input_ids", input_ids); - dumper->Print("position_ids", position_ids); - dumper->Print("attention_mask", attention_mask); -#else - ORT_UNUSED_PARAMETER(dumper); -#endif - // Update past state if (num_beams == 1) { const int k = gpt_subgraph_first_past_input_idx - gpt_subgraph_first_present_output_idx; @@ -662,12 +653,12 @@ Status ExpandBuffer(void* stream, for (int i = 0; i < batch_size; i++) { for (int j = 0; j < num_beams; j++) { CUDA_RETURN_IF_ERROR( - cudaMemcpyAsync( - target, - input_data + i * chunk_size, - sizeof(T) * chunk_size, - cudaMemcpyDeviceToDevice, - cuda_stream)); + cudaMemcpyAsync( + target, + input_data + i * chunk_size, + sizeof(T) * chunk_size, + cudaMemcpyDeviceToDevice, + cuda_stream)); target += chunk_size; } } @@ -714,12 +705,12 @@ template Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); // Float16 template void InitBeamState(transformers::IBeamSearchState* beam_state, @@ -748,12 +739,12 @@ template Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); template Status UpdateDecoderFeeds( AllocatorPtr allocator, diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h index 14f64e923e781..4424fee6d5cb2 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h @@ -68,12 +68,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); // --------------------------------------------------------------- // Functions for encoder-decoder model like T5 diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu index 4f93b1dded937..6bc52758c7cc3 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu @@ -248,9 +248,11 @@ __global__ void UpdateGptInputsKernel(const T* old_mask_data, int j = index % current_length; mask_data[index] = (j < current_length - 1) ? old_mask_data[i * (current_length - 1) + j] : static_cast(1); - // Update sequence length (or next positions). - if (index < batch_beam_size) { - next_positions[index]++; + if (next_positions != nullptr) { + // Update sequence length (or next positions). 
+      if (index < batch_beam_size) {
+        next_positions[index]++;
+      }
     }
   }
 }
diff --git a/onnxruntime/core/common/cpuid_arch_definition.h b/onnxruntime/core/common/cpuid_arch_definition.h
new file mode 100644
index 0000000000000..a541eb66d8ba3
--- /dev/null
+++ b/onnxruntime/core/common/cpuid_arch_definition.h
@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// This file defines the CPUIDINFO_ARCH_* symbols.
+
+#pragma once
+
+#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__i386__) || defined(__x86_64__)
+#define CPUIDINFO_ARCH_X86
+#endif
+
+#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__)
+#define CPUIDINFO_ARCH_ARM
+#endif // ARM or ARM64
diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h
index f76f0b0a1527c..ff535d889386d 100644
--- a/onnxruntime/core/common/cpuid_info.h
+++ b/onnxruntime/core/common/cpuid_info.h
@@ -4,14 +4,7 @@
 #pragma once

 #include "core/common/common.h"
-
-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__i386__) || defined(__x86_64__)
-#define CPUIDINFO_ARCH_X86
-#endif
-
-#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__)
-#define CPUIDINFO_ARCH_ARM
-#endif // ARM or ARM64
+#include "core/common/cpuid_arch_definition.h"

 namespace onnxruntime {

@@ -31,7 +24,7 @@ class CPUIDInfo {
   bool HasSSE4_1() const { return has_sse4_1_; }
   bool IsHybrid() const { return is_hybrid_; }

-  // ARM 
+  // ARM
   bool HasArmNeonDot() const { return has_arm_neon_dot_; }

   uint32_t GetCurrentCoreIdx() const;
@@ -72,7 +65,7 @@ class CPUIDInfo {
     }
     return is_armv8_narrow_ld_[coreId];
   }
-  
+
   /**
    * @brief Some ARMv8 power efficient core has narrower 64b load/store
    * that needs specialized optimization in kernels
diff --git a/onnxruntime/core/common/cpuid_uarch.cc b/onnxruntime/core/common/cpuid_uarch.cc
new file mode 100644
index 0000000000000..e9d8de9732b15
--- /dev/null
+++ b/onnxruntime/core/common/cpuid_uarch.cc
@@ -0,0 +1,369 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
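The new cpuid_uarch.cc below moves decodeMIDR out of the header. As a reading aid for the long switch that follows: the ARM MIDR register packs implementer, variant, architecture, part and revision into a single 32-bit value. A small, self-contained sketch of the field extraction and a sample decode (constants mirror the masks defined below; the example value comes from the Samsung comments further down, and this sketch is illustrative, not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>

// MIDR bit layout: [31:24] implementer, [23:20] variant, [19:16] architecture,
// [15:4] part number, [3:0] revision.
static uint32_t Implementer(uint32_t midr) { return (midr & 0xFF000000u) >> 24; }
static uint32_t Variant(uint32_t midr) { return (midr & 0x00F00000u) >> 20; }
static uint32_t Part(uint32_t midr) { return (midr & 0x0000FFF0u) >> 4; }

int main() {
  // Exynos 8890 example taken from the comments in decodeMIDR:
  // implementer 'S' (Samsung), variant 0x1, part 0x001 -> Exynos M1.
  const uint32_t midr = 0x531F0011;
  std::printf("implementer='%c' variant=0x%X part=0x%03X\n",
              static_cast<char>(Implementer(midr)), Variant(midr), Part(midr));
  return 0;
}
```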
+ +#include "core/common/cpuid_uarch.h" + +#include "core/common/logging/logging.h" + +namespace onnxruntime { + +#if defined(CPUIDINFO_ARCH_ARM) + +#define CPUINFO_ARM_MIDR_IMPLEMENTER_MASK UINT32_C(0xFF000000) +#define CPUINFO_ARM_MIDR_VARIANT_MASK UINT32_C(0x00F00000) +#define CPUINFO_ARM_MIDR_ARCHITECTURE_MASK UINT32_C(0x000F0000) +#define CPUINFO_ARM_MIDR_PART_MASK UINT32_C(0x0000FFF0) +#define CPUINFO_ARM_MIDR_REVISION_MASK UINT32_C(0x0000000F) + +#define CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET 24 +#define CPUINFO_ARM_MIDR_VARIANT_OFFSET 20 +#define CPUINFO_ARM_MIDR_ARCHITECTURE_OFFSET 16 +#define CPUINFO_ARM_MIDR_PART_OFFSET 4 +#define CPUINFO_ARM_MIDR_REVISION_OFFSET 0 + +inline static uint32_t midr_get_implementer(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_IMPLEMENTER_MASK) >> CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET; +} + +inline static uint32_t midr_get_part(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_PART_MASK) >> CPUINFO_ARM_MIDR_PART_OFFSET; +} + +inline static uint32_t midr_get_variant(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) >> CPUINFO_ARM_MIDR_VARIANT_OFFSET; +} + +void decodeMIDR( + uint32_t midr, + uint32_t uarch[1]) { + switch (midr_get_implementer(midr)) { + case 'A': + switch (midr_get_part(midr)) { + //#if defined(_M_ARM) || defined(__arm__) + case 0xC05: + *uarch = cpuinfo_uarch_cortex_a5; + break; + case 0xC07: + *uarch = cpuinfo_uarch_cortex_a7; + break; + case 0xC08: + *uarch = cpuinfo_uarch_cortex_a8; + break; + case 0xC09: + *uarch = cpuinfo_uarch_cortex_a9; + break; + case 0xC0C: + *uarch = cpuinfo_uarch_cortex_a12; + break; + case 0xC0E: + *uarch = cpuinfo_uarch_cortex_a17; + break; + case 0xC0D: + /* + * Rockchip RK3288 only. + * Core information is ambiguous: some sources specify Cortex-A12, others - Cortex-A17. + * Assume it is Cortex-A12. + */ + *uarch = cpuinfo_uarch_cortex_a12; + break; + case 0xC0F: + *uarch = cpuinfo_uarch_cortex_a15; + break; + //#endif /* ARM */ + case 0xD01: + *uarch = cpuinfo_uarch_cortex_a32; + break; + case 0xD03: + *uarch = cpuinfo_uarch_cortex_a53; + break; + case 0xD04: + *uarch = cpuinfo_uarch_cortex_a35; + break; + case 0xD05: + // Note: use Variant, not Revision, field + *uarch = (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) == 0 ? 
cpuinfo_uarch_cortex_a55r0 : cpuinfo_uarch_cortex_a55; + break; + case 0xD06: + *uarch = cpuinfo_uarch_cortex_a65; + break; + case 0xD07: + *uarch = cpuinfo_uarch_cortex_a57; + break; + case 0xD08: + *uarch = cpuinfo_uarch_cortex_a72; + break; + case 0xD09: + *uarch = cpuinfo_uarch_cortex_a73; + break; + case 0xD0A: + *uarch = cpuinfo_uarch_cortex_a75; + break; + case 0xD0B: + *uarch = cpuinfo_uarch_cortex_a76; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xD0C: + *uarch = cpuinfo_uarch_neoverse_n1; + break; + //#endif /* ARM64 && !defined(__ANDROID__) */ + case 0xD0D: + *uarch = cpuinfo_uarch_cortex_a77; + break; + case 0xD0E: /* Cortex-A76AE */ + *uarch = cpuinfo_uarch_cortex_a76; + break; + case 0xD41: /* Cortex-A78 */ + *uarch = cpuinfo_uarch_cortex_a78; + break; + case 0xD44: /* Cortex-X1 */ + *uarch = cpuinfo_uarch_cortex_x1; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xD4A: + *uarch = cpuinfo_uarch_neoverse_e1; + break; + //#endif /* ARM64 && !defined(__ANDROID__) */ + default: + switch (midr_get_part(midr) >> 8) { + //#if defined(_M_ARM) || defined(__arm__) + case 7: + *uarch = cpuinfo_uarch_arm7; + break; + case 9: + *uarch = cpuinfo_uarch_arm9; + break; + case 11: + *uarch = cpuinfo_uarch_arm11; + break; + //#endif /* ARM */ + default: + LOGS_DEFAULT(WARNING) << "unknown ARM CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + } + break; + case 'B': + switch (midr_get_part(midr)) { + case 0x00F: + *uarch = cpuinfo_uarch_brahma_b15; + break; + case 0x100: + *uarch = cpuinfo_uarch_brahma_b53; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0x516: + /* Broadcom Vulkan was sold to Cavium before it reached the market, so we identify it as Cavium ThunderX2 */ + *uarch = cpuinfo_uarch_thunderx2; + break; + //#endif + default: + LOGS_DEFAULT(WARNING) << "unknown Broadcom CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 'C': + switch (midr_get_part(midr)) { + case 0x0A0: /* ThunderX */ + case 0x0A1: /* ThunderX 88XX */ + case 0x0A2: /* ThunderX 81XX */ + case 0x0A3: /* ThunderX 83XX */ + *uarch = cpuinfo_uarch_thunderx; + break; + case 0x0AF: /* ThunderX2 99XX */ + *uarch = cpuinfo_uarch_thunderx2; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Cavium CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#endif + case 'H': + switch (midr_get_part(midr)) { + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xD01: /* Kunpeng 920 series */ + *uarch = cpuinfo_uarch_taishan_v110; + break; + //#endif + case 0xD40: /* Kirin 980 Big/Medium cores -> Cortex-A76 */ + *uarch = cpuinfo_uarch_cortex_a76; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Huawei CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#if defined(_M_ARM) || defined(__arm__) + case 'i': + switch (midr_get_part(midr) >> 8) { + case 2: /* PXA 210/25X/26X */ + case 4: /* PXA 27X */ + case 6: /* PXA 3XX */ + *uarch = cpuinfo_uarch_xscale; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Intel CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#endif /* ARM */ + case 'N': + switch (midr_get_part(midr)) { + case 0x000: + *uarch = cpuinfo_uarch_denver; + break; + case 0x003: + *uarch = cpuinfo_uarch_denver2; + break; + case 0x004: + 
*uarch = cpuinfo_uarch_carmel; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Nvidia CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; +#if !defined(__ANDROID__) + case 'P': + switch (midr_get_part(midr)) { + case 0x000: + *uarch = cpuinfo_uarch_xgene; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Applied Micro CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; +#endif + case 'Q': + switch (midr_get_part(midr)) { + // #if defined(_M_ARM) || defined(__arm__) + case 0x00F: + /* Mostly Scorpions, but some Cortex A5 may report this value as well */ + //if (has_vfpv4) { + // /* Unlike Scorpion, Cortex-A5 comes with VFPv4 */ + // *vendor = cpuinfo_vendor_arm; + // *uarch = cpuinfo_uarch_cortex_a5; + //} else { + *uarch = cpuinfo_uarch_scorpion; + // } + break; + case 0x02D: /* Dual-core Scorpions */ + *uarch = cpuinfo_uarch_scorpion; + break; + case 0x04D: + /* + * Dual-core Krait: + * - r1p0 -> Krait 200 + * - r1p4 -> Krait 200 + * - r2p0 -> Krait 300 + */ + case 0x06F: + /* + * Quad-core Krait: + * - r0p1 -> Krait 200 + * - r0p2 -> Krait 200 + * - r1p0 -> Krait 300 + * - r2p0 -> Krait 400 (Snapdragon 800 MSMxxxx) + * - r2p1 -> Krait 400 (Snapdragon 801 MSMxxxxPRO) + * - r3p1 -> Krait 450 + */ + *uarch = cpuinfo_uarch_krait; + break; + //#endif /* ARM */ + case 0x201: /* Qualcomm Snapdragon 821: Low-power Kryo "Silver" */ + case 0x205: /* Qualcomm Snapdragon 820 & 821: High-performance Kryo "Gold" */ + case 0x211: /* Qualcomm Snapdragon 820: Low-power Kryo "Silver" */ + *uarch = cpuinfo_uarch_kryo; + break; + case 0x800: /* High-performance Kryo 260 (r10p2) / Kryo 280 (r10p1) "Gold" -> Cortex-A73 */ + *uarch = cpuinfo_uarch_cortex_a73; + break; + case 0x801: /* Low-power Kryo 260 / 280 "Silver" -> Cortex-A53 */ + *uarch = cpuinfo_uarch_cortex_a53; + break; + case 0x802: /* High-performance Kryo 385 "Gold" -> Cortex-A75 */ + *uarch = cpuinfo_uarch_cortex_a75; + break; + case 0x803: /* Low-power Kryo 385 "Silver" -> Cortex-A55r0 */ + *uarch = cpuinfo_uarch_cortex_a55r0; + break; + case 0x804: /* High-performance Kryo 485 "Gold" / "Gold Prime" -> Cortex-A76 */ + *uarch = cpuinfo_uarch_cortex_a76; + break; + case 0x805: /* Low-performance Kryo 485 "Silver" -> Cortex-A55 */ + *uarch = cpuinfo_uarch_cortex_a55; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xC00: + *uarch = cpuinfo_uarch_falkor; + break; + case 0xC01: + *uarch = cpuinfo_uarch_saphira; + break; + //#endif /* ARM64 && !defined(__ANDROID__) */ + default: + LOGS_DEFAULT(WARNING) << "unknown Qualcomm CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + case 'S': + switch (midr & (CPUINFO_ARM_MIDR_VARIANT_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { + case 0x00100010: + /* + * Exynos 8890 MIDR = 0x531F0011, assume Exynos M1 has: + * - CPU variant 0x1 + * - CPU part 0x001 + */ + *uarch = cpuinfo_uarch_exynos_m1; + break; + case 0x00400010: + /* + * Exynos 8895 MIDR = 0x534F0010, assume Exynos M2 has: + * - CPU variant 0x4 + * - CPU part 0x001 + */ + *uarch = cpuinfo_uarch_exynos_m2; + break; + case 0x00100020: + /* + * Exynos 9810 MIDR = 0x531F0020, assume Exynos M3 has: + * - CPU variant 0x1 + * - CPU part 0x002 + */ + *uarch = cpuinfo_uarch_exynos_m3; + break; + case 0x00100030: + /* + * Exynos 9820 MIDR = 0x531F0030, assume Exynos M4 has: + * - CPU variant 0x1 + * - CPU part 0x003 + */ + *uarch = cpuinfo_uarch_exynos_m4; + break; + case 0x00100040: + /* + * Exynos 9820 MIDR = 0x531F0040, assume Exynos M5 
has: + * - CPU variant 0x1 + * - CPU part 0x004 + */ + *uarch = cpuinfo_uarch_exynos_m5; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Samsung CPU variant 0x" + << std::hex << midr_get_variant(midr) << " part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#if defined(_M_ARM) || defined(__arm__) + case 'V': + switch (midr_get_part(midr)) { + case 0x581: /* PJ4 / PJ4B */ + case 0x584: /* PJ4B-MP / PJ4C */ + *uarch = cpuinfo_uarch_pj4; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Marvell CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#endif /* ARM */ + default: + LOGS_DEFAULT(WARNING) << "unknown CPU uarch from MIDR value: 0x" << std::hex << midr; + } +} + +#endif // arm or arm64 + +} // namespace onnxruntime diff --git a/onnxruntime/core/common/cpuid_uarch.h b/onnxruntime/core/common/cpuid_uarch.h index 73a05c4b91a3a..c781561292209 100644 --- a/onnxruntime/core/common/cpuid_uarch.h +++ b/onnxruntime/core/common/cpuid_uarch.h @@ -22,6 +22,14 @@ Module Name: --*/ +#pragma once + +#include + +#include "core/common/cpuid_arch_definition.h" + +namespace onnxruntime { + enum CPUIDINFOuarch { /** Microarchitecture is unknown, or the library failed to get information about the microarchitecture from OS */ cpuinfo_uarch_unknown = 0, @@ -175,359 +183,8 @@ enum CPUIDINFOuarch { #if defined(CPUIDINFO_ARCH_ARM) -#define CPUINFO_ARM_MIDR_IMPLEMENTER_MASK UINT32_C(0xFF000000) -#define CPUINFO_ARM_MIDR_VARIANT_MASK UINT32_C(0x00F00000) -#define CPUINFO_ARM_MIDR_ARCHITECTURE_MASK UINT32_C(0x000F0000) -#define CPUINFO_ARM_MIDR_PART_MASK UINT32_C(0x0000FFF0) -#define CPUINFO_ARM_MIDR_REVISION_MASK UINT32_C(0x0000000F) - -#define CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET 24 -#define CPUINFO_ARM_MIDR_VARIANT_OFFSET 20 -#define CPUINFO_ARM_MIDR_ARCHITECTURE_OFFSET 16 -#define CPUINFO_ARM_MIDR_PART_OFFSET 4 -#define CPUINFO_ARM_MIDR_REVISION_OFFSET 0 - -inline static uint32_t midr_get_implementer(uint32_t midr) { - return (midr & CPUINFO_ARM_MIDR_IMPLEMENTER_MASK) >> CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET; -} - -inline static uint32_t midr_get_part(uint32_t midr) { - return (midr & CPUINFO_ARM_MIDR_PART_MASK) >> CPUINFO_ARM_MIDR_PART_OFFSET; -} - -inline static uint32_t midr_get_variant(uint32_t midr) { - return (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) >> CPUINFO_ARM_MIDR_VARIANT_OFFSET; -} - -static void decodeMIDR( - uint32_t midr, - uint32_t uarch[1]) { - switch (midr_get_implementer(midr)) { - case 'A': - switch (midr_get_part(midr)) { - //#if defined(_M_ARM) || defined(__arm__) - case 0xC05: - *uarch = cpuinfo_uarch_cortex_a5; - break; - case 0xC07: - *uarch = cpuinfo_uarch_cortex_a7; - break; - case 0xC08: - *uarch = cpuinfo_uarch_cortex_a8; - break; - case 0xC09: - *uarch = cpuinfo_uarch_cortex_a9; - break; - case 0xC0C: - *uarch = cpuinfo_uarch_cortex_a12; - break; - case 0xC0E: - *uarch = cpuinfo_uarch_cortex_a17; - break; - case 0xC0D: - /* - * Rockchip RK3288 only. - * Core information is ambiguous: some sources specify Cortex-A12, others - Cortex-A17. - * Assume it is Cortex-A12. - */ - *uarch = cpuinfo_uarch_cortex_a12; - break; - case 0xC0F: - *uarch = cpuinfo_uarch_cortex_a15; - break; - //#endif /* ARM */ - case 0xD01: - *uarch = cpuinfo_uarch_cortex_a32; - break; - case 0xD03: - *uarch = cpuinfo_uarch_cortex_a53; - break; - case 0xD04: - *uarch = cpuinfo_uarch_cortex_a35; - break; - case 0xD05: - // Note: use Variant, not Revision, field - *uarch = (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) == 0 ? 
cpuinfo_uarch_cortex_a55r0 : cpuinfo_uarch_cortex_a55; - break; - case 0xD06: - *uarch = cpuinfo_uarch_cortex_a65; - break; - case 0xD07: - *uarch = cpuinfo_uarch_cortex_a57; - break; - case 0xD08: - *uarch = cpuinfo_uarch_cortex_a72; - break; - case 0xD09: - *uarch = cpuinfo_uarch_cortex_a73; - break; - case 0xD0A: - *uarch = cpuinfo_uarch_cortex_a75; - break; - case 0xD0B: - *uarch = cpuinfo_uarch_cortex_a76; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xD0C: - *uarch = cpuinfo_uarch_neoverse_n1; - break; - //#endif /* ARM64 && !defined(__ANDROID__) */ - case 0xD0D: - *uarch = cpuinfo_uarch_cortex_a77; - break; - case 0xD0E: /* Cortex-A76AE */ - *uarch = cpuinfo_uarch_cortex_a76; - break; - case 0xD41: /* Cortex-A78 */ - *uarch = cpuinfo_uarch_cortex_a78; - break; - case 0xD44: /* Cortex-X1 */ - *uarch = cpuinfo_uarch_cortex_x1; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xD4A: - *uarch = cpuinfo_uarch_neoverse_e1; - break; - //#endif /* ARM64 && !defined(__ANDROID__) */ - default: - switch (midr_get_part(midr) >> 8) { - //#if defined(_M_ARM) || defined(__arm__) - case 7: - *uarch = cpuinfo_uarch_arm7; - break; - case 9: - *uarch = cpuinfo_uarch_arm9; - break; - case 11: - *uarch = cpuinfo_uarch_arm11; - break; - //#endif /* ARM */ - default: - LOGS_DEFAULT(WARNING) << "unknown ARM CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - } - break; - case 'B': - switch (midr_get_part(midr)) { - case 0x00F: - *uarch = cpuinfo_uarch_brahma_b15; - break; - case 0x100: - *uarch = cpuinfo_uarch_brahma_b53; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0x516: - /* Broadcom Vulkan was sold to Cavium before it reached the market, so we identify it as Cavium ThunderX2 */ - *uarch = cpuinfo_uarch_thunderx2; - break; - //#endif - default: - LOGS_DEFAULT(WARNING) << "unknown Broadcom CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 'C': - switch (midr_get_part(midr)) { - case 0x0A0: /* ThunderX */ - case 0x0A1: /* ThunderX 88XX */ - case 0x0A2: /* ThunderX 81XX */ - case 0x0A3: /* ThunderX 83XX */ - *uarch = cpuinfo_uarch_thunderx; - break; - case 0x0AF: /* ThunderX2 99XX */ - *uarch = cpuinfo_uarch_thunderx2; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Cavium CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#endif - case 'H': - switch (midr_get_part(midr)) { - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xD01: /* Kunpeng 920 series */ - *uarch = cpuinfo_uarch_taishan_v110; - break; - //#endif - case 0xD40: /* Kirin 980 Big/Medium cores -> Cortex-A76 */ - *uarch = cpuinfo_uarch_cortex_a76; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Huawei CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#if defined(_M_ARM) || defined(__arm__) - case 'i': - switch (midr_get_part(midr) >> 8) { - case 2: /* PXA 210/25X/26X */ - case 4: /* PXA 27X */ - case 6: /* PXA 3XX */ - *uarch = cpuinfo_uarch_xscale; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Intel CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#endif /* ARM */ - case 'N': - switch (midr_get_part(midr)) { - case 0x000: - *uarch = cpuinfo_uarch_denver; - break; - case 0x003: - *uarch = cpuinfo_uarch_denver2; - break; - case 0x004: - 
*uarch = cpuinfo_uarch_carmel; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Nvidia CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; -#if !defined(__ANDROID__) - case 'P': - switch (midr_get_part(midr)) { - case 0x000: - *uarch = cpuinfo_uarch_xgene; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Applied Micro CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; -#endif - case 'Q': - switch (midr_get_part(midr)) { - // #if defined(_M_ARM) || defined(__arm__) - case 0x00F: - /* Mostly Scorpions, but some Cortex A5 may report this value as well */ - //if (has_vfpv4) { - // /* Unlike Scorpion, Cortex-A5 comes with VFPv4 */ - // *vendor = cpuinfo_vendor_arm; - // *uarch = cpuinfo_uarch_cortex_a5; - //} else { - *uarch = cpuinfo_uarch_scorpion; - // } - break; - case 0x02D: /* Dual-core Scorpions */ - *uarch = cpuinfo_uarch_scorpion; - break; - case 0x04D: - /* - * Dual-core Krait: - * - r1p0 -> Krait 200 - * - r1p4 -> Krait 200 - * - r2p0 -> Krait 300 - */ - case 0x06F: - /* - * Quad-core Krait: - * - r0p1 -> Krait 200 - * - r0p2 -> Krait 200 - * - r1p0 -> Krait 300 - * - r2p0 -> Krait 400 (Snapdragon 800 MSMxxxx) - * - r2p1 -> Krait 400 (Snapdragon 801 MSMxxxxPRO) - * - r3p1 -> Krait 450 - */ - *uarch = cpuinfo_uarch_krait; - break; - //#endif /* ARM */ - case 0x201: /* Qualcomm Snapdragon 821: Low-power Kryo "Silver" */ - case 0x205: /* Qualcomm Snapdragon 820 & 821: High-performance Kryo "Gold" */ - case 0x211: /* Qualcomm Snapdragon 820: Low-power Kryo "Silver" */ - *uarch = cpuinfo_uarch_kryo; - break; - case 0x800: /* High-performance Kryo 260 (r10p2) / Kryo 280 (r10p1) "Gold" -> Cortex-A73 */ - *uarch = cpuinfo_uarch_cortex_a73; - break; - case 0x801: /* Low-power Kryo 260 / 280 "Silver" -> Cortex-A53 */ - *uarch = cpuinfo_uarch_cortex_a53; - break; - case 0x802: /* High-performance Kryo 385 "Gold" -> Cortex-A75 */ - *uarch = cpuinfo_uarch_cortex_a75; - break; - case 0x803: /* Low-power Kryo 385 "Silver" -> Cortex-A55r0 */ - *uarch = cpuinfo_uarch_cortex_a55r0; - break; - case 0x804: /* High-performance Kryo 485 "Gold" / "Gold Prime" -> Cortex-A76 */ - *uarch = cpuinfo_uarch_cortex_a76; - break; - case 0x805: /* Low-performance Kryo 485 "Silver" -> Cortex-A55 */ - *uarch = cpuinfo_uarch_cortex_a55; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xC00: - *uarch = cpuinfo_uarch_falkor; - break; - case 0xC01: - *uarch = cpuinfo_uarch_saphira; - break; - //#endif /* ARM64 && !defined(__ANDROID__) */ - default: - LOGS_DEFAULT(WARNING) << "unknown Qualcomm CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - case 'S': - switch (midr & (CPUINFO_ARM_MIDR_VARIANT_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { - case 0x00100010: - /* - * Exynos 8890 MIDR = 0x531F0011, assume Exynos M1 has: - * - CPU variant 0x1 - * - CPU part 0x001 - */ - *uarch = cpuinfo_uarch_exynos_m1; - break; - case 0x00400010: - /* - * Exynos 8895 MIDR = 0x534F0010, assume Exynos M2 has: - * - CPU variant 0x4 - * - CPU part 0x001 - */ - *uarch = cpuinfo_uarch_exynos_m2; - break; - case 0x00100020: - /* - * Exynos 9810 MIDR = 0x531F0020, assume Exynos M3 has: - * - CPU variant 0x1 - * - CPU part 0x002 - */ - *uarch = cpuinfo_uarch_exynos_m3; - break; - case 0x00100030: - /* - * Exynos 9820 MIDR = 0x531F0030, assume Exynos M4 has: - * - CPU variant 0x1 - * - CPU part 0x003 - */ - *uarch = cpuinfo_uarch_exynos_m4; - break; - case 0x00100040: - /* - * Exynos 9820 MIDR = 0x531F0040, assume Exynos M5 
has:
-     *   - CPU variant 0x1
-     *   - CPU part 0x004
-     */
-      *uarch = cpuinfo_uarch_exynos_m5;
-      break;
-    default:
-      LOGS_DEFAULT(WARNING) << "unknown Samsung CPU variant 0x"
-                            << std::hex << midr_get_variant(midr) << " part 0x" << std::hex << midr_get_part(midr) << " ignored";
-  }
-  break;
-  //#if defined(_M_ARM) || defined(__arm__)
-  case 'V':
-    switch (midr_get_part(midr)) {
-      case 0x581: /* PJ4 / PJ4B */
-      case 0x584: /* PJ4B-MP / PJ4C */
-        *uarch = cpuinfo_uarch_pj4;
-        break;
-      default:
-        LOGS_DEFAULT(WARNING) << "unknown Marvell CPU part 0x" << std::hex << midr_get_part(midr) << " ignored";
-    }
-    break;
-  //#endif /* ARM */
-  default:
-    LOGS_DEFAULT(WARNING) << "unknown CPU uarch from MIDR value: 0x" << std::hex << midr;
-  }
-}
+void decodeMIDR(uint32_t midr, uint32_t uarch[1]);
 #endif // arm or arm64
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
index 6cdb63319a5bf..0e996ec98a730 100644
--- a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
+++ b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
@@ -2001,7 +2001,9 @@ OptimizeResult OptimizeImpl(OptimizerCtx& ctx) {
       continue;
     }

-    if (!HandleQuantizeDequantizeScale(ctx.graph, *perm, *dq_node, ctx.opset)) {
+    // we're moving the Transpose to before the DQ, so we need to use the inverse permutation to update the axis
+    // attribute correctly when doing per-axis dequantization
+    if (!HandleQuantizeDequantizeScale(ctx.graph, InvertPerm(*perm), *dq_node, ctx.opset)) {
       continue;
     }

diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index 4358ddc7c3e26..c18075cc7b4b9 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -40,6 +40,23 @@ namespace onnxruntime {

 namespace {

+class UnmapFileParam {
+ public:
+  void* addr;
+  size_t len;
+};
+
+static void UnmapFile(void* param) noexcept {
+  UnmapFileParam* p = reinterpret_cast(param);
+  bool ret = UnmapViewOfFile(p->addr);
+  if (!ret) {
+    const auto error_code = GetLastError();
+    LOGS_DEFAULT(ERROR) << "unmap view of file failed. 
error code: " << error_code + << " error msg: " << std::system_category().message(error_code); + } + delete p; +} + std::wstring Basename(const std::wstring& path) { auto basename_index = path.find_last_of(L"/\\") + 1; // results in 0 if no separator is found return path.substr(basename_index); @@ -320,8 +337,95 @@ class WindowsEnv : public Env { return Status::OK(); } + /** Status MapFileIntoMemory(_In_z_ const ORTCHAR_T*, FileOffsetType, size_t, MappedMemoryPtr&) const override { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "MapFileIntoMemory is not implemented on Windows."); + }*/ + + Status MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path, + FileOffsetType offset, + size_t length, + MappedMemoryPtr& mapped_memory) const override { + ORT_RETURN_IF_NOT(file_path, "file_path == nullptr"); + ORT_RETURN_IF_NOT(offset >= 0, "offset < 0"); + + if (length == 0) { + mapped_memory = MappedMemoryPtr{}; + return Status::OK(); + } + +#if WINVER >= _WIN32_WINNT_WIN8 + wil::unique_hfile file_handle{ + CreateFile2(file_path, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL)}; +#else + wil::unique_hfile file_handle{ + CreateFileW(file_path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)}; +#endif + if (file_handle.get() == INVALID_HANDLE_VALUE) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "open file ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + +#if NTDDI_VERSION >= NTDDI_WIN10_RS5 + wil::unique_hfile file_mapping_handle{ + CreateFileMapping2(file_handle.get(), + nullptr, + FILE_MAP_READ, + PAGE_READONLY, + SEC_COMMIT, + 0, + nullptr, + nullptr, + 0)}; +#else + wil::unique_hfile file_mapping_handle{ + CreateFileMappingW(file_handle.get(), + nullptr, + PAGE_READONLY, + 0, + 0, + nullptr)}; +#endif + if (file_mapping_handle.get() == INVALID_HANDLE_VALUE) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "open file mapping ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + + static const DWORD page_size = sysinfo.dwPageSize; + static const DWORD allocation_granularity = sysinfo.dwAllocationGranularity; + const FileOffsetType offset_to_page = offset % static_cast(page_size); + const size_t mapped_length = length + static_cast(offset_to_page); + const FileOffsetType mapped_offset = offset - offset_to_page; + if (mapped_offset % allocation_granularity != 0) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "mapped offset must be a multiple of the allocation granularity", + " , mapped_offset = ", mapped_offset, + " , allocation_granularity = ", allocation_granularity, + " , errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + + void* const mapped_base = MapViewOfFile(file_mapping_handle.get(), + FILE_MAP_READ, + 0, + static_cast(mapped_offset), + mapped_length); + + mapped_memory = + MappedMemoryPtr{reinterpret_cast(mapped_base) + offset_to_page, + OrtCallbackInvoker{OrtCallback{UnmapFile, new UnmapFileParam{mapped_base, mapped_length}}}}; + + return Status::OK(); } bool FolderExists(const std::wstring& path) const override { diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.h b/onnxruntime/core/providers/cpu/object_detection/roialign.h index 9ba7f89caf4d4..1bb8bd34c5cb2 
100644
--- a/onnxruntime/core/providers/cpu/object_detection/roialign.h
+++ b/onnxruntime/core/providers/cpu/object_detection/roialign.h
@@ -29,7 +29,6 @@ class RoiAlignBase {
     } else {
       ORT_THROW("Invalid mode of value ", mode, " specified. It should be either avg or max");
     }
-    mode_ = mode == "avg" ? RoiAlignMode::avg : RoiAlignMode::max;
   }

   // output_height
@@ -64,6 +63,13 @@ class RoiAlignBase {
     else
       half_pixel_ = false;
   }
+
+    if (mode_ == RoiAlignMode::max && sampling_ratio_ != 1) {
+      // TODO(fdwr): Issue #6146. ORT 1.13 will correct the summation of max mode via PR #7354.
+      LOGS_DEFAULT(WARNING) << "The current summation for max mode with sampling ratios other than 1 is incorrect "
+                            << "and will be fixed in the ORT 1.13 release, so the results of RoiAlign "
+                            << "will change.";
+    }
 }

 protected:
diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
index 68e2be1b34cf7..cfeb9a220277e 100644
--- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc
+++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
@@ -5,6 +5,8 @@
 #include

 #include "gtest/gtest.h"
+#include "gmock/gmock.h"
+
 #include "graph_transform_test_builder.h"

 #include "core/graph/graph.h"
@@ -3620,7 +3622,6 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearTransposePropagation) {
     EXPECT_EQ(op_types_in_order, expected_op_types_in_order);
   };

-
   TransformerTester(build_test_case_1,
                     check_graph,
                     TransformerLevel::Default,
@@ -4047,5 +4048,41 @@ TEST(TransposeOptimizerTests, RegressionTest_GitHubIssue10305) {
   ASSERT_STATUS_OK(session_object.Load(model_uri));
   ASSERT_STATUS_OK(session_object.Initialize());  // optimizers run during initialization
 }
+
+// regression test for a model with a DQ node using per-axis dequantization followed by a Transpose.
+// the second phase can swap those around, but needs to use the correct perms for updating the 'axis'
+// attribute in the DQ node.
+// see https://github.com/microsoft/onnxruntime/issues/12151 for more details.
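To spell out the fix the comment above describes: when a Transpose with permutation perm is moved in front of a per-axis DQ, the DQ's axis attribute has to be remapped through the inverse permutation, which is what the InvertPerm call added to the optimizer does. A self-contained sketch (helper names here are illustrative, not the optimizer's internal API):

```cpp
#include <cstddef>
#include <vector>

// Inverse of a permutation: inv[perm[i]] = i.
std::vector<size_t> InvertPerm(const std::vector<size_t>& perm) {
  std::vector<size_t> inv(perm.size());
  for (size_t i = 0; i < perm.size(); ++i) {
    inv[perm[i]] = i;
  }
  return inv;
}

// Axis `axis` of the original input sits at InvertPerm(perm)[axis] once the
// data has been transposed, so that is the DQ node's new 'axis' value.
size_t RemapDqAxis(size_t axis, const std::vector<size_t>& perm) {
  return InvertPerm(perm)[axis];
}
```

For example, with perm = {0, 2, 3, 1} (NCHW to NHWC) a per-channel DQ on axis 1 ends up on axis 3.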
+TEST(TransposeOptimizerTests, RegressionTest_GitHubIssue12151) { + Status status; + auto model_uri = ORT_TSTR("testdata/ort_github_issue_12151.onnx"); + + NameMLValMap feeds; // no inputs for this model + std::vector output_names{"Z"}; + std::vector fetches_orig; + std::vector fetches; + + SessionOptions so; + so.session_logid = "TransposeOptimizerTests.RegressionTest_GitHubIssue12151"; + + { + so.graph_optimization_level = TransformerLevel::Default; // off + InferenceSession session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(model_uri)); + ASSERT_STATUS_OK(session_object.Initialize()); + ASSERT_STATUS_OK(session_object.Run(feeds, output_names, &fetches_orig)); + } + + { + so.graph_optimization_level = TransformerLevel::Level1; // enable transpose optimizer + InferenceSession session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(model_uri)); + ASSERT_STATUS_OK(session_object.Initialize()); + ASSERT_STATUS_OK(session_object.Run(feeds, output_names, &fetches)); + } + + ASSERT_THAT(fetches_orig[0].Get().DataAsSpan(), + testing::ContainerEq(fetches[0].Get().DataAsSpan())); +} } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc index e4670583f11b9..9dec13767ef90 100644 --- a/onnxruntime/test/platform/file_io_test.cc +++ b/onnxruntime/test/platform/file_io_test.cc @@ -10,6 +10,8 @@ #ifndef _WIN32 #include // for sysconf() and _SC_PAGESIZE +#else +#include #endif #include "gsl/gsl" @@ -61,7 +63,11 @@ std::vector GenerateData(size_t length, uint32_t seed = 0) { } void WriteDataToFile(gsl::span data, const PathString& path) { +#ifndef _WIN32 std::ofstream out{path, std::ios_base::out | std::ios_base::trunc}; +#else + std::ofstream out{path, std::ios_base::out | std::ios_base::trunc | std::ios_base::binary}; +#endif out.write(data.data(), data.size()); } @@ -144,6 +150,59 @@ TEST(FileIoTest, MapFileIntoMemory) { ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK()); } } +#else +TEST(FileIoTest, MapFileIntoMemory) { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + static const auto page_size = sysinfo.dwPageSize; + static const auto allocation_granularity = sysinfo.dwAllocationGranularity; + ASSERT_GT(page_size, static_cast(0)); + + TempFilePath tmp(ORT_TSTR("map_file_test_")); + const auto expected_data = GenerateData(page_size * 3 / 2); + WriteDataToFile(gsl::make_span(expected_data), tmp.path); + + const auto offsets_and_lengths = GenerateValidOffsetLengthPairs( + 0, expected_data.size(), page_size / 10); + + for (const auto& offset_and_length : offsets_and_lengths) { + const auto offset = offset_and_length.first; + const auto length = offset_and_length.second; + + // The offset must be a multiple of the allocation granularity + if (offset % allocation_granularity != 0) { + continue; + } + + Env::MappedMemoryPtr mapped_memory{}; + auto status = Env::Default().MapFileIntoMemory( + tmp.path.c_str(), offset, length, mapped_memory); + ASSERT_TRUE(status.IsOK()) + << "MapFileIntoMemory failed for offset " << offset << " and length " << length + << " with error: " << status.ErrorMessage(); + + auto mapped_span = gsl::make_span(mapped_memory.get(), length); + + auto expected_data_span = gsl::make_span(expected_data.data() + offset, length); + + ASSERT_EQ(mapped_span, expected_data_span); + } + + { + Env::MappedMemoryPtr mapped_memory{}; + + // invalid - offset is not a multiple of the allocation granularity + 
ASSERT_FALSE(Env::Default().MapFileIntoMemory(
+        tmp.path.c_str(), allocation_granularity * 3 / 2, page_size / 10, mapped_memory).IsOK());
+  }
+
+  {
+    Env::MappedMemoryPtr mapped_memory{};
+
+    // invalid - negative offset
+    ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK());
+  }
+}
 #endif

 } // namespace test
diff --git a/onnxruntime/test/testdata/ort_github_issue_12151.onnx b/onnxruntime/test/testdata/ort_github_issue_12151.onnx
new file mode 100644
index 0000000000000..f796b46f1bdc2
Binary files /dev/null and b/onnxruntime/test/testdata/ort_github_issue_12151.onnx differ
diff --git a/orttraining/orttraining/python/training/optim/_ds_modifier.py b/orttraining/orttraining/python/training/optim/_ds_modifier.py
index d9515041f5dfa..6ae6ccee51184 100644
--- a/orttraining/orttraining/python/training/optim/_ds_modifier.py
+++ b/orttraining/orttraining/python/training/optim/_ds_modifier.py
@@ -10,10 +10,11 @@
 # - has_overflow_partitioned_grads_serial : https://github.com/microsoft/DeepSpeed/blob/d8e9ef6f99e27bb95e10bd146d145b3372b4cfda/deepspeed/runtime/zero/stage2.py#L1799
 # --------------------------------------------------------------------------

-import torch
 import types
 import warnings
 from distutils.version import LooseVersion
+
+import torch
 from numpy import inf

 from ._modifier import FP16OptimizerModifier, check_overflow, check_overflow_for_grads
@@ -27,14 +28,11 @@ def __init__(self, optimizer, **kwargs) -> None:
         super().__init__(optimizer)

     def can_be_modified(self):
-        try:
-            import deepspeed
-
-            v = LooseVersion(deepspeed.__version__)
-            if v > LooseVersion("0.5.4") or v < LooseVersion("0.4.0"):
-                warnings.warn("Unsupported DeepSpeed version to override, skipped.", UserWarning)
-                return False
-        except Exception as _:
+        import deepspeed
+
+        ds_version = LooseVersion(deepspeed.__version__)
+        if ds_version > LooseVersion("0.6.5") or ds_version < LooseVersion("0.4.0"):
+            warnings.warn("Skipping optimizer modification: unsupported DeepSpeed version.", UserWarning)
             return False

         return self.check_requirements(
@@ -141,7 +139,8 @@ def has_overflow_partitioned_grads_serial(target):
         #### END OF THE ORIGINAL IMPLEMENTATION ####

         #### THIS IS THE FASTER IMPLEMENTATION ####
-        for i in range(len(target.fp16_groups)):
+        groups = target.fp16_groups if hasattr(target, "fp16_groups") else target.bit16_groups
+        for i in range(len(groups)):
             grad_data = [grad.data for grad in target.averaged_gradients[i] if grad is not None]
             if check_overflow_for_grads(grad_data):
                 return True
diff --git a/orttraining/orttraining/python/training/optim/_modifier.py b/orttraining/orttraining/python/training/optim/_modifier.py
index 9897ed41210e6..b3ad73110d34a 100644
--- a/orttraining/orttraining/python/training/optim/_modifier.py
+++ b/orttraining/orttraining/python/training/optim/_modifier.py
@@ -9,6 +9,7 @@
 # --------------------------------------------------------------------------

 import torch
+import warnings
 from numpy import inf

 from ._multi_tensor_apply import MultiTensorApply
@@ -32,12 +33,16 @@ def check_requirements(self, required_funcs, require_apex=False, require_torch_n
             if require_torch_non_finite_check is True:
                 _ = torch._amp_foreach_non_finite_check_and_unscale_
         except Exception as _:
+            warnings.warn("Skipping optimizer modification: Apex or torch._amp_foreach_non_finite_check_and_unscale_ was not found.", UserWarning)
             return False

         if required_funcs:
             for func_name in required_funcs:
                 func = getattr(self._optimizer, func_name, None)
                 if not func or not callable(func):
+                    warnings.warn(
+                        "Skipping optimizer modification: a required function was not found on the optimizer.", UserWarning
+                    )
                     return False

         return True
diff --git a/orttraining/orttraining/python/training/optim/_modifier_registry.py b/orttraining/orttraining/python/training/optim/_modifier_registry.py
index 142999f3f72c7..4291b792a4607 100644
--- a/orttraining/orttraining/python/training/optim/_modifier_registry.py
+++ b/orttraining/orttraining/python/training/optim/_modifier_registry.py
@@ -7,12 +7,9 @@
 from ._megatron_modifier import LegacyMegatronLMModifier
 from ._apex_amp_modifier import ApexAMPModifier

-LEAGCY_MEGATRON_LM_OPTIMIZER_NAME = "megatron.fp16.fp16.FP16_Optimizer"
-DEEPSPEED_ZERO1_AND_ZERO2_OPTIMIZER_NAME = "deepspeed.runtime.zero.stage2.FP16_DeepSpeedZeroOptimizer"
-APEX_AMP_OPTIMIZER_NAME = "apex.amp.optimizer.unique_name_as_id"
-
 OptimizerModifierTypeRegistry = {
-    LEAGCY_MEGATRON_LM_OPTIMIZER_NAME: LegacyMegatronLMModifier,
-    DEEPSPEED_ZERO1_AND_ZERO2_OPTIMIZER_NAME: DeepSpeedZeROModifier,
-    APEX_AMP_OPTIMIZER_NAME: ApexAMPModifier,
+    "megatron.fp16.fp16.FP16_Optimizer": LegacyMegatronLMModifier,
+    "deepspeed.runtime.zero.stage2.FP16_DeepSpeedZeroOptimizer": DeepSpeedZeROModifier,
+    "deepspeed.runtime.zero.stage_1_and_2.DeepSpeedZeroOptimizer": DeepSpeedZeROModifier,
+    "apex.amp.optimizer.unique_name_as_id": ApexAMPModifier,
 }
diff --git a/orttraining/orttraining/python/training/optim/fp16_optimizer.py b/orttraining/orttraining/python/training/optim/fp16_optimizer.py
index c4c353249f1ee..c3864ea711f24 100644
--- a/orttraining/orttraining/python/training/optim/fp16_optimizer.py
+++ b/orttraining/orttraining/python/training/optim/fp16_optimizer.py
@@ -3,6 +3,8 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------

+import warnings
+
 from ._modifier_registry import OptimizerModifierTypeRegistry

@@ -90,6 +92,7 @@ def get_full_qualified_type_name(o):

     optimizer_full_qualified_name = get_full_qualified_type_name(optimizer)
     if optimizer_full_qualified_name not in OptimizerModifierTypeRegistry:
+        warnings.warn("Skipping optimizer modification: optimizer type was not found in the registry.", UserWarning)
         return optimizer

     modifier = OptimizerModifierTypeRegistry[optimizer_full_qualified_name](optimizer, **kwargs)
diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py
index 63af43ce48eb7..1459d3b86dcdb 100644
--- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py
+++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py
@@ -4,14 +4,17 @@
 # --------------------------------------------------------------------------

 import sys
+import warnings
+
 import torch
 import torch.utils.checkpoint
-import warnings
+from packaging import version
 from torch.onnx import symbolic_helper

 from onnxruntime.capi._pybind_state import register_torch_autograd_function
-from ._fallback import _FallbackManager, ORTModuleONNXModelException, ORTModuleTorchModelException, wrap_exception
+
 from . import _logger
+from ._fallback import ORTModuleONNXModelException, ORTModuleTorchModelException, _FallbackManager, wrap_exception

 # Some autograd.Functions shouldn't be exported as PythonOp.
 # If CheckpointFunction is exported as PythonOp, the checkpointed computation
@@ -37,7 +40,15 @@ def _export_pt_1_10(g, n, *args, **kwargs):
             "wrap exportable sub-nn.Modules as ORTModule."
        )
     inplace = kwargs["inplace"]
-    training_mode = symbolic_helper._training_mode
+    # TODO: move to the public API once the exporter team exposes it.
+    training_mode = None
+    runtime_pytorch_version = version.parse(torch.__version__.split("+")[0])
+    if runtime_pytorch_version > version.parse("1.11"):
+        from torch.onnx import _globals
+
+        training_mode = _globals.GLOBALS.training_mode
+    else:
+        training_mode = symbolic_helper._training_mode
     cconv = n.cconv()
     input_tensor_types = []
     input_requires_grads = []
diff --git a/requirements-training.txt b/requirements-training.txt
index 4b1be6cef9b7c..82f0331314da6 100644
--- a/requirements-training.txt
+++ b/requirements-training.txt
@@ -4,6 +4,6 @@ h5py
 numpy >= 1.16.6
 onnx
 packaging
-protobuf
+protobuf >= 3.12.2, <= 3.20.1
 sympy
 setuptools>=41.4.0
diff --git a/setup.py b/setup.py
index db13319174440..0805a593723cb 100644
--- a/setup.py
+++ b/setup.py
@@ -463,10 +463,10 @@ def finalize_options(self):
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python",
         "Programming Language :: Python :: 3 :: Only",
-        "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
     ]

     if not enable_training:
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index f7567fbee8192..08cccff5e3033 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -875,7 +875,7 @@ def generate_build_tree(
         "-Donnxruntime_ENABLE_TRAINING_TORCH_INTEROP=" + ("ON" if args.enable_training_torch_interop else "OFF"),
         # Enable advanced computations such as AVX for some training-related ops.
         "-Donnxruntime_ENABLE_CPU_FP16_OPS=" + ("ON" if args.enable_training else "OFF"),
-        "-Donnxruntime_USE_NCCL=" + ("OFF" if args.disable_nccl else "ON"),
+        "-Donnxruntime_USE_NCCL=" + ("ON" if args.enable_training and not args.disable_nccl else "OFF"),
         "-Donnxruntime_BUILD_BENCHMARKS=" + ("ON" if args.build_micro_benchmarks else "OFF"),
         "-Donnxruntime_USE_ROCM=" + ("ON" if args.use_rocm else "OFF"),
         "-DOnnxruntime_GCOV_COVERAGE=" + ("ON" if args.code_coverage else "OFF"),
diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh
index e9e8ef4b3481f..da86205b52f61 100755
--- a/tools/ci_build/github/linux/copy_strip_binary.sh
+++ b/tools/ci_build/github/linux/copy_strip_binary.sh
@@ -36,7 +36,7 @@ then
   strip -S $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME
   ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib
   # copy the CoreML EP header for macOS build (libs with .dylib ext)
-  cp $SOURCE_DIR/onnxruntime/core/providers/coreml/coreml_execution_provider.h $BINARY_DIR/$ARTIFACT_NAME/include
+  cp $SOURCE_DIR/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include
 elif [[ $LIB_NAME == *.so.* ]]
 then
   ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.so
diff --git a/tools/ci_build/github/windows/jar_packaging.ps1 b/tools/ci_build/github/windows/jar_packaging.ps1
index 679e27b459efc..a132ba6b26e2a 100644
--- a/tools/ci_build/github/windows/jar_packaging.ps1
+++ b/tools/ci_build/github/windows/jar_packaging.ps1
@@ -16,8 +16,10 @@ Remove-Item -Path libcustom_op_library.dylib
 7z a $Env:BUILD_BINARIESDIRECTORY\java-artifact\onnxruntime-java-win-x64\onnxruntime-$Env:ONNXRUNTIMEVERSION.jar .
popd pushd onnxruntime-java-linux-aarch64 +Remove-Item -Path libcustom_op_library.so 7z a $Env:BUILD_BINARIESDIRECTORY\java-artifact\onnxruntime-java-win-x64\onnxruntime-$Env:ONNXRUNTIMEVERSION.jar . popd pushd onnxruntime-java-osx-arm64 +Remove-Item -Path libcustom_op_library.dylib 7z a $Env:BUILD_BINARIESDIRECTORY\java-artifact\onnxruntime-java-win-x64\onnxruntime-$Env:ONNXRUNTIMEVERSION.jar . popd
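One closing note on the Windows MapFileIntoMemory implementation added earlier in this patch: MapViewOfFile only accepts view offsets that are multiples of the allocation granularity (commonly 64 KiB), which is why both the implementation and the new file_io_test reject unaligned offsets. The offset arithmetic in isolation (illustrative constants; the real values come from GetSystemInfo):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t page_size = 4096;                // e.g. sysinfo.dwPageSize
  const int64_t granularity = 65536;             // e.g. sysinfo.dwAllocationGranularity
  const int64_t offset = 2 * granularity + 300;  // caller-requested file offset

  // Round down to a page boundary and keep the remainder; the patch returns
  // mapped_base + offset_to_page so the caller still sees the exact offset.
  const int64_t offset_to_page = offset % page_size;
  const int64_t mapped_offset = offset - offset_to_page;

  // MapViewOfFile additionally requires granularity alignment, which the
  // implementation checks explicitly before mapping.
  std::printf("mapped_offset=%lld offset_to_page=%lld aligned=%d\n",
              static_cast<long long>(mapped_offset),
              static_cast<long long>(offset_to_page),
              static_cast<int>(mapped_offset % granularity == 0));
  return 0;
}
```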