diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc index 7b163dd923a31..68b1aab919c3d 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.cc @@ -197,7 +197,6 @@ void InitBeamState(transformers::IBeamSearchState* beam_state, // T5 does not need position, so next_positions is empty for T5. if (!beam_state->next_positions.empty()) { - memset(beam_state->next_positions.data(), 0, beam_state->next_positions.size_bytes()); gsl::copy(sequence_lengths, beam_state->next_positions); } @@ -274,13 +273,13 @@ Status ProcessLogits(const OrtValue& logits, // // Get scores for candidates of next token: next_token_scores = log_softmax(next_token_logits, dim=-1) gsl::span& next_token_scores = beam_state->next_token_scores; ORT_RETURN_IF_ERROR( - SoftmaxCPU( - batch_beam_size, // rows - vocab_size, // elements per row - (input_length == 1 && logits_batch_size == batch_beam_size) ? logits_data : next_token_logits.data(), - next_token_scores.data(), - true, - thread_pool)); + SoftmaxCPU( + batch_beam_size, // rows + vocab_size, // elements per row + (input_length == 1 && logits_batch_size == batch_beam_size) ? logits_data : next_token_logits.data(), + next_token_scores.data(), + true, + thread_pool)); #ifdef DEBUG_BEAM_SEARCH dumper->Print("next_token_scores after softmax", next_token_scores.data(), batch_size, num_beams, vocab_size); @@ -428,12 +427,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper) { + int gpt_subgraph_first_present_output_idx) { // last_outputs: logits, present_0, present_1, ... 
// next_inputs: input_ids, position_id, attention_mask, past_0, past_1 ORT_UNUSED_PARAMETER(stream); @@ -454,10 +453,12 @@ Status UpdateGptFeeds( } next_inputs[0] = input_ids; - // Update position IDs - int32_t* position_data = position_ids.GetMutable()->MutableData(); - for (int i = 0; i < batch_beam_size; i++) { - position_data[i]++; + if (increase_position) { + // Update position IDs + int32_t* position_data = position_ids.GetMutable()->MutableData(); + for (int i = 0; i < batch_beam_size; i++) { + position_data[i]++; + } } next_inputs[1] = position_ids; @@ -477,14 +478,6 @@ Status UpdateGptFeeds( } next_inputs[2] = attention_mask; -#ifdef DEBUG_BEAM_SEARCH - dumper->Print("input_ids", input_ids); - dumper->Print("position_ids", position_ids); - dumper->Print("attention_mask", attention_mask); -#else - ORT_UNUSED_PARAMETER(dumper); -#endif - // Update past state if (num_beams == 1) { // feed present_* output to past_* inputs one by one @@ -725,12 +718,12 @@ template Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); template Status UpdateDecoderFeeds( AllocatorPtr allocator, @@ -751,28 +744,28 @@ template Status UpdateDecoderFeeds( template void ExpandInputs(const OrtValue& input, int num_beams, AllocatorPtr allocator, OrtValue& expanded); template Status ExpandBuffer( - void* stream, - const OrtValue& input, - int num_beams, - AllocatorPtr allocator, - OrtValue& expanded, - bool only_copy_shape); + void* stream, + const OrtValue& input, + int num_beams, + AllocatorPtr allocator, + OrtValue& expanded, + bool only_copy_shape); template Status ExpandBuffer( - void* stream, - const OrtValue& input, - int num_beams, - AllocatorPtr allocator, - OrtValue& expanded, - bool only_copy_shape); + void* stream, + const OrtValue& input, + int num_beams, + AllocatorPtr allocator, + OrtValue& expanded, + bool only_copy_shape); template Status ExpandBuffer( - void* stream, - const OrtValue& input, - int num_beams, - AllocatorPtr allocator, - OrtValue& expanded, - bool only_copy_shape); + void* stream, + const OrtValue& input, + int num_beams, + AllocatorPtr allocator, + OrtValue& expanded, + bool only_copy_shape); } // namespace BeamSearchCpuDeviceHelper } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h index ab18eec25cde0..36ab8d8e93a98 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_device_helper.h @@ -96,12 +96,12 @@ using UpdateGptFeedsFunc = std::function& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper)>; + int gpt_subgraph_first_present_output_idx)>; // Create encoder inputs (for encoder-decoder model like T5). 
 using CreateEncoderInputsFunc = std::function;

 }  // namespace BeamSearchDeviceHelper

-
 // These are CPU specific device helper implementations
 namespace BeamSearchCpuDeviceHelper {

 Status TopK(
@@ -208,12 +207,12 @@ Status UpdateGptFeeds(
     std::vector& next_inputs,
     int current_length,
     OrtValue& position_ids,
+    bool increase_position,
    gsl::span beam_next_tokens,
    gsl::span beam_indices,
    int num_beams,
    int gpt_subgraph_first_past_input_idx,
-    int gpt_subgraph_first_present_output_idx,
-    const transformers::IConsoleDumper* dumper);
+    int gpt_subgraph_first_present_output_idx);

 // ---------------------------------------------------------------
 // Functions for encoder-decoder model like T5
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
index 9cf5daeba929d..7674c2a781055 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
@@ -56,6 +56,7 @@ class BeamSearchGpt : public BeamSearchBase {
                      std::vector& next_inputs,
                      int current_length,
                      OrtValue& position_ids,
+                     bool increase_position,
                      gsl::span beam_next_tokens,
                      gsl::span beam_indices);

@@ -93,6 +94,7 @@ Status BeamSearchGpt::UpdateFeeds(
     std::vector& next_inputs,
     int current_length,
     OrtValue& position_ids,
+    bool increase_position,
     gsl::span beam_next_tokens,
     gsl::span beam_indices) {
   return update_feeds_func_(this->temp_space_allocator_,
@@ -101,12 +103,12 @@ Status BeamSearchGpt::UpdateFeeds(
                             next_inputs,
                             current_length,
                             position_ids,
+                            increase_position,
                             beam_next_tokens,
                             beam_indices,
                             this->parameters_->num_beams,
                             gpt_subgraph_.GetFirstPastInputIndex(),
-                            gpt_subgraph_.GetFirstPresentOutputIndex(),
-                            this->GetConsoleDumper());
+                            gpt_subgraph_.GetFirstPresentOutputIndex());
 }

 template
@@ -186,11 +188,7 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager& feeds_fetches_manage
 #ifdef DEBUG_BEAM_SEARCH
   const IConsoleDumper* dumper = this->GetConsoleDumper();
-  dumper->Print("input_ids", feeds[0]);
-  dumper->Print("position_ids", feeds[1]);
-  dumper->Print("attention_mask", feeds[2]);
 #endif
-
   // Position ids for all iterations except the first. It uses the memory buffer owned by next_positions.
   OrtValue position_ids;
   int64_t dims[] = {parameters->BatchBeamSize(), 1};
@@ -205,9 +203,19 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager& feeds_fetches_manage
   int iteration_counter = 0;
   while (current_length < parameters->max_length) {
     iteration_counter++;
+
 #ifdef DEBUG_BEAM_SEARCH
     auto cur_len = std::to_string(current_length);
     dumper->Print("***CurrentLength", cur_len, true);
+    dumper->Print("iteration", iteration_counter, true);
+
+    dumper->Print("input_ids", feeds[0]);
+    dumper->Print("position_ids", feeds[1]);
+    dumper->Print("attention_mask", feeds[2]);
+    for (size_t i = 3; i < feeds.size(); i++) {
+      dumper->Print("past", static_cast(i) - 3, true);
+      dumper->Print("", feeds[i]);
+    }
 #endif

     status = utils::ExecuteSubgraph(this->decoder_session_state_,
@@ -241,8 +249,11 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager& feeds_fetches_manage
     // Prepare inputs for next round of subgraph call.
     if (current_length < parameters->max_length) {
+      // For the first iteration, position_ids is initialized with the sequence lengths, so we can add it to feeds directly.
+      // For the remaining iterations, we need to increase position_ids first, then add it to feeds.
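To make the new flag concrete before the code continues: a minimal sketch of the update it gates, assuming int32_t position ids with one entry per beam (the real logic lives in the UpdateGptFeeds helpers patched above). It also shows why the memset removed from InitBeamState was redundant: the buffer is fully overwritten by the copied sequence lengths.

```cpp
#include <cstdint>
#include <vector>

// Iteration 1 (increase_position == false): the buffer already holds the
// per-beam sequence lengths copied in by InitBeamState, so feed it as-is.
// Iterations > 1: every beam has generated exactly one more token, so each
// position advances by one before the next subgraph call.
void AdvancePositions(std::vector<int32_t>& positions, bool increase_position) {
  if (!increase_position) {
    return;
  }
  for (auto& p : positions) {
    ++p;
  }
}
```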
+ bool increase_position = (iteration_counter > 1); ORT_RETURN_IF_ERROR(UpdateFeeds(fetches, feeds, current_length, - position_ids, + position_ids, increase_position, beam_next_tokens.as_span(), beam_indices.as_span())); } diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc index b712908259da1..780e98909c603 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.cc @@ -258,9 +258,8 @@ Status ProcessLogits(const OrtValue& logits, // // The output will be float for consideration of precision and easy integration with remaining parts. float* Y_data = next_token_scores.data(); - const CudaT* X_data = (input_length == 1 && logits_batch_size == batch_beam_size) ? - logits_data : - reinterpret_cast(next_token_logits.data()); + bool is_single_token = (input_length == 1 && logits_batch_size == batch_beam_size); + const CudaT* X_data = is_single_token ? logits_data : reinterpret_cast(next_token_logits.data()); dispatch_blockwise_softmax_forward( cuda_stream, Y_data, X_data, vocab_size, vocab_size, batch_size * num_beams); @@ -500,12 +499,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper) { + int gpt_subgraph_first_present_output_idx) { // Update input_ids with next tokens. int batch_beam_size = static_cast(beam_next_tokens.length()); int64_t dims[] = {batch_beam_size, 1}; @@ -519,7 +518,7 @@ Status UpdateGptFeeds( next_inputs[0] = input_ids; // Update position IDs - int32_t* position_data = position_ids.GetMutable()->MutableData(); + int32_t* position_data = increase_position ? 
position_ids.GetMutable()->MutableData() : nullptr; next_inputs[1] = position_ids; // Update attention mask @@ -538,14 +537,6 @@ Status UpdateGptFeeds( next_inputs[2] = attention_mask; -#ifdef DEBUG_BEAM_SEARCH - dumper->Print("input_ids", input_ids); - dumper->Print("position_ids", position_ids); - dumper->Print("attention_mask", attention_mask); -#else - ORT_UNUSED_PARAMETER(dumper); -#endif - // Update past state if (num_beams == 1) { const int k = gpt_subgraph_first_past_input_idx - gpt_subgraph_first_present_output_idx; @@ -662,12 +653,12 @@ Status ExpandBuffer(void* stream, for (int i = 0; i < batch_size; i++) { for (int j = 0; j < num_beams; j++) { CUDA_RETURN_IF_ERROR( - cudaMemcpyAsync( - target, - input_data + i * chunk_size, - sizeof(T) * chunk_size, - cudaMemcpyDeviceToDevice, - cuda_stream)); + cudaMemcpyAsync( + target, + input_data + i * chunk_size, + sizeof(T) * chunk_size, + cudaMemcpyDeviceToDevice, + cuda_stream)); target += chunk_size; } } @@ -714,12 +705,12 @@ template Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); // Float16 template void InitBeamState(transformers::IBeamSearchState* beam_state, @@ -748,12 +739,12 @@ template Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); template Status UpdateDecoderFeeds( AllocatorPtr allocator, diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h index 14f64e923e781..4424fee6d5cb2 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_device_helper.h @@ -68,12 +68,12 @@ Status UpdateGptFeeds( std::vector& next_inputs, int current_length, OrtValue& position_ids, + bool increase_position, gsl::span beam_next_tokens, gsl::span beam_indices, int num_beams, int gpt_subgraph_first_past_input_idx, - int gpt_subgraph_first_present_output_idx, - const transformers::IConsoleDumper* dumper); + int gpt_subgraph_first_present_output_idx); // --------------------------------------------------------------- // Functions for encoder-decoder model like T5 diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu index 4f93b1dded937..6bc52758c7cc3 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu @@ -248,9 +248,11 @@ __global__ void UpdateGptInputsKernel(const T* old_mask_data, int j = index % current_length; mask_data[index] = (j < current_length - 1) ? old_mask_data[i * (current_length - 1) + j] : static_cast(1); - // Update sequence length (or next positions). - if (index < batch_beam_size) { - next_positions[index]++; + if (next_positions != nullptr) { + // Update sequence length (or next positions). 
+      if (index < batch_beam_size) {
+        next_positions[index]++;
+      }
     }
   }
 }
diff --git a/onnxruntime/core/common/cpuid_arch_definition.h b/onnxruntime/core/common/cpuid_arch_definition.h
new file mode 100644
index 0000000000000..a541eb66d8ba3
--- /dev/null
+++ b/onnxruntime/core/common/cpuid_arch_definition.h
@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// This file defines the CPUIDINFO_ARCH_* symbols.
+
+#pragma once
+
+#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__i386__) || defined(__x86_64__)
+#define CPUIDINFO_ARCH_X86
+#endif
+
+#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__)
+#define CPUIDINFO_ARCH_ARM
+#endif // ARM or ARM64
diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h
index f76f0b0a1527c..ff535d889386d 100644
--- a/onnxruntime/core/common/cpuid_info.h
+++ b/onnxruntime/core/common/cpuid_info.h
@@ -4,14 +4,7 @@
 #pragma once

 #include "core/common/common.h"
-
-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__i386__) || defined(__x86_64__)
-#define CPUIDINFO_ARCH_X86
-#endif
-
-#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__)
-#define CPUIDINFO_ARCH_ARM
-#endif // ARM or ARM64
+#include "core/common/cpuid_arch_definition.h"

 namespace onnxruntime {

@@ -31,7 +24,7 @@ class CPUIDInfo {
   bool HasSSE4_1() const { return has_sse4_1_; }
   bool IsHybrid() const { return is_hybrid_; }

-  // ARM 
+  // ARM
   bool HasArmNeonDot() const { return has_arm_neon_dot_; }

   uint32_t GetCurrentCoreIdx() const;
@@ -72,7 +65,7 @@ class CPUIDInfo {
     }
     return is_armv8_narrow_ld_[coreId];
   }
-  
+
   /**
    * @brief Some ARMv8 power efficient core has narrower 64b load/store
    * that needs specialized optimization in kernels
diff --git a/onnxruntime/core/common/cpuid_uarch.cc b/onnxruntime/core/common/cpuid_uarch.cc
new file mode 100644
index 0000000000000..e9d8de9732b15
--- /dev/null
+++ b/onnxruntime/core/common/cpuid_uarch.cc
@@ -0,0 +1,369 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
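The new cpuid_uarch.cc below moves decodeMIDR out of the header. As a reading aid for the long switch that follows: the ARM MIDR register packs implementer, variant, architecture, part and revision into a single 32-bit value. A small, self-contained sketch of the field extraction and a sample decode (constants mirror the masks defined below; the example value comes from the Samsung comments further down, and this sketch is illustrative, not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>

// MIDR bit layout: [31:24] implementer, [23:20] variant, [19:16] architecture,
// [15:4] part number, [3:0] revision.
static uint32_t Implementer(uint32_t midr) { return (midr & 0xFF000000u) >> 24; }
static uint32_t Variant(uint32_t midr) { return (midr & 0x00F00000u) >> 20; }
static uint32_t Part(uint32_t midr) { return (midr & 0x0000FFF0u) >> 4; }

int main() {
  // Exynos 8890 example taken from the comments in decodeMIDR:
  // implementer 'S' (Samsung), variant 0x1, part 0x001 -> Exynos M1.
  const uint32_t midr = 0x531F0011;
  std::printf("implementer='%c' variant=0x%X part=0x%03X\n",
              static_cast<char>(Implementer(midr)), Variant(midr), Part(midr));
  return 0;
}
```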
+ +#include "core/common/cpuid_uarch.h" + +#include "core/common/logging/logging.h" + +namespace onnxruntime { + +#if defined(CPUIDINFO_ARCH_ARM) + +#define CPUINFO_ARM_MIDR_IMPLEMENTER_MASK UINT32_C(0xFF000000) +#define CPUINFO_ARM_MIDR_VARIANT_MASK UINT32_C(0x00F00000) +#define CPUINFO_ARM_MIDR_ARCHITECTURE_MASK UINT32_C(0x000F0000) +#define CPUINFO_ARM_MIDR_PART_MASK UINT32_C(0x0000FFF0) +#define CPUINFO_ARM_MIDR_REVISION_MASK UINT32_C(0x0000000F) + +#define CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET 24 +#define CPUINFO_ARM_MIDR_VARIANT_OFFSET 20 +#define CPUINFO_ARM_MIDR_ARCHITECTURE_OFFSET 16 +#define CPUINFO_ARM_MIDR_PART_OFFSET 4 +#define CPUINFO_ARM_MIDR_REVISION_OFFSET 0 + +inline static uint32_t midr_get_implementer(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_IMPLEMENTER_MASK) >> CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET; +} + +inline static uint32_t midr_get_part(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_PART_MASK) >> CPUINFO_ARM_MIDR_PART_OFFSET; +} + +inline static uint32_t midr_get_variant(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) >> CPUINFO_ARM_MIDR_VARIANT_OFFSET; +} + +void decodeMIDR( + uint32_t midr, + uint32_t uarch[1]) { + switch (midr_get_implementer(midr)) { + case 'A': + switch (midr_get_part(midr)) { + //#if defined(_M_ARM) || defined(__arm__) + case 0xC05: + *uarch = cpuinfo_uarch_cortex_a5; + break; + case 0xC07: + *uarch = cpuinfo_uarch_cortex_a7; + break; + case 0xC08: + *uarch = cpuinfo_uarch_cortex_a8; + break; + case 0xC09: + *uarch = cpuinfo_uarch_cortex_a9; + break; + case 0xC0C: + *uarch = cpuinfo_uarch_cortex_a12; + break; + case 0xC0E: + *uarch = cpuinfo_uarch_cortex_a17; + break; + case 0xC0D: + /* + * Rockchip RK3288 only. + * Core information is ambiguous: some sources specify Cortex-A12, others - Cortex-A17. + * Assume it is Cortex-A12. + */ + *uarch = cpuinfo_uarch_cortex_a12; + break; + case 0xC0F: + *uarch = cpuinfo_uarch_cortex_a15; + break; + //#endif /* ARM */ + case 0xD01: + *uarch = cpuinfo_uarch_cortex_a32; + break; + case 0xD03: + *uarch = cpuinfo_uarch_cortex_a53; + break; + case 0xD04: + *uarch = cpuinfo_uarch_cortex_a35; + break; + case 0xD05: + // Note: use Variant, not Revision, field + *uarch = (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) == 0 ? 
cpuinfo_uarch_cortex_a55r0 : cpuinfo_uarch_cortex_a55; + break; + case 0xD06: + *uarch = cpuinfo_uarch_cortex_a65; + break; + case 0xD07: + *uarch = cpuinfo_uarch_cortex_a57; + break; + case 0xD08: + *uarch = cpuinfo_uarch_cortex_a72; + break; + case 0xD09: + *uarch = cpuinfo_uarch_cortex_a73; + break; + case 0xD0A: + *uarch = cpuinfo_uarch_cortex_a75; + break; + case 0xD0B: + *uarch = cpuinfo_uarch_cortex_a76; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xD0C: + *uarch = cpuinfo_uarch_neoverse_n1; + break; + //#endif /* ARM64 && !defined(__ANDROID__) */ + case 0xD0D: + *uarch = cpuinfo_uarch_cortex_a77; + break; + case 0xD0E: /* Cortex-A76AE */ + *uarch = cpuinfo_uarch_cortex_a76; + break; + case 0xD41: /* Cortex-A78 */ + *uarch = cpuinfo_uarch_cortex_a78; + break; + case 0xD44: /* Cortex-X1 */ + *uarch = cpuinfo_uarch_cortex_x1; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xD4A: + *uarch = cpuinfo_uarch_neoverse_e1; + break; + //#endif /* ARM64 && !defined(__ANDROID__) */ + default: + switch (midr_get_part(midr) >> 8) { + //#if defined(_M_ARM) || defined(__arm__) + case 7: + *uarch = cpuinfo_uarch_arm7; + break; + case 9: + *uarch = cpuinfo_uarch_arm9; + break; + case 11: + *uarch = cpuinfo_uarch_arm11; + break; + //#endif /* ARM */ + default: + LOGS_DEFAULT(WARNING) << "unknown ARM CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + } + break; + case 'B': + switch (midr_get_part(midr)) { + case 0x00F: + *uarch = cpuinfo_uarch_brahma_b15; + break; + case 0x100: + *uarch = cpuinfo_uarch_brahma_b53; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0x516: + /* Broadcom Vulkan was sold to Cavium before it reached the market, so we identify it as Cavium ThunderX2 */ + *uarch = cpuinfo_uarch_thunderx2; + break; + //#endif + default: + LOGS_DEFAULT(WARNING) << "unknown Broadcom CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 'C': + switch (midr_get_part(midr)) { + case 0x0A0: /* ThunderX */ + case 0x0A1: /* ThunderX 88XX */ + case 0x0A2: /* ThunderX 81XX */ + case 0x0A3: /* ThunderX 83XX */ + *uarch = cpuinfo_uarch_thunderx; + break; + case 0x0AF: /* ThunderX2 99XX */ + *uarch = cpuinfo_uarch_thunderx2; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Cavium CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#endif + case 'H': + switch (midr_get_part(midr)) { + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xD01: /* Kunpeng 920 series */ + *uarch = cpuinfo_uarch_taishan_v110; + break; + //#endif + case 0xD40: /* Kirin 980 Big/Medium cores -> Cortex-A76 */ + *uarch = cpuinfo_uarch_cortex_a76; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Huawei CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#if defined(_M_ARM) || defined(__arm__) + case 'i': + switch (midr_get_part(midr) >> 8) { + case 2: /* PXA 210/25X/26X */ + case 4: /* PXA 27X */ + case 6: /* PXA 3XX */ + *uarch = cpuinfo_uarch_xscale; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Intel CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#endif /* ARM */ + case 'N': + switch (midr_get_part(midr)) { + case 0x000: + *uarch = cpuinfo_uarch_denver; + break; + case 0x003: + *uarch = cpuinfo_uarch_denver2; + break; + case 0x004: + 
*uarch = cpuinfo_uarch_carmel; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Nvidia CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; +#if !defined(__ANDROID__) + case 'P': + switch (midr_get_part(midr)) { + case 0x000: + *uarch = cpuinfo_uarch_xgene; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Applied Micro CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; +#endif + case 'Q': + switch (midr_get_part(midr)) { + // #if defined(_M_ARM) || defined(__arm__) + case 0x00F: + /* Mostly Scorpions, but some Cortex A5 may report this value as well */ + //if (has_vfpv4) { + // /* Unlike Scorpion, Cortex-A5 comes with VFPv4 */ + // *vendor = cpuinfo_vendor_arm; + // *uarch = cpuinfo_uarch_cortex_a5; + //} else { + *uarch = cpuinfo_uarch_scorpion; + // } + break; + case 0x02D: /* Dual-core Scorpions */ + *uarch = cpuinfo_uarch_scorpion; + break; + case 0x04D: + /* + * Dual-core Krait: + * - r1p0 -> Krait 200 + * - r1p4 -> Krait 200 + * - r2p0 -> Krait 300 + */ + case 0x06F: + /* + * Quad-core Krait: + * - r0p1 -> Krait 200 + * - r0p2 -> Krait 200 + * - r1p0 -> Krait 300 + * - r2p0 -> Krait 400 (Snapdragon 800 MSMxxxx) + * - r2p1 -> Krait 400 (Snapdragon 801 MSMxxxxPRO) + * - r3p1 -> Krait 450 + */ + *uarch = cpuinfo_uarch_krait; + break; + //#endif /* ARM */ + case 0x201: /* Qualcomm Snapdragon 821: Low-power Kryo "Silver" */ + case 0x205: /* Qualcomm Snapdragon 820 & 821: High-performance Kryo "Gold" */ + case 0x211: /* Qualcomm Snapdragon 820: Low-power Kryo "Silver" */ + *uarch = cpuinfo_uarch_kryo; + break; + case 0x800: /* High-performance Kryo 260 (r10p2) / Kryo 280 (r10p1) "Gold" -> Cortex-A73 */ + *uarch = cpuinfo_uarch_cortex_a73; + break; + case 0x801: /* Low-power Kryo 260 / 280 "Silver" -> Cortex-A53 */ + *uarch = cpuinfo_uarch_cortex_a53; + break; + case 0x802: /* High-performance Kryo 385 "Gold" -> Cortex-A75 */ + *uarch = cpuinfo_uarch_cortex_a75; + break; + case 0x803: /* Low-power Kryo 385 "Silver" -> Cortex-A55r0 */ + *uarch = cpuinfo_uarch_cortex_a55r0; + break; + case 0x804: /* High-performance Kryo 485 "Gold" / "Gold Prime" -> Cortex-A76 */ + *uarch = cpuinfo_uarch_cortex_a76; + break; + case 0x805: /* Low-performance Kryo 485 "Silver" -> Cortex-A55 */ + *uarch = cpuinfo_uarch_cortex_a55; + break; + //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) + case 0xC00: + *uarch = cpuinfo_uarch_falkor; + break; + case 0xC01: + *uarch = cpuinfo_uarch_saphira; + break; + //#endif /* ARM64 && !defined(__ANDROID__) */ + default: + LOGS_DEFAULT(WARNING) << "unknown Qualcomm CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + case 'S': + switch (midr & (CPUINFO_ARM_MIDR_VARIANT_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { + case 0x00100010: + /* + * Exynos 8890 MIDR = 0x531F0011, assume Exynos M1 has: + * - CPU variant 0x1 + * - CPU part 0x001 + */ + *uarch = cpuinfo_uarch_exynos_m1; + break; + case 0x00400010: + /* + * Exynos 8895 MIDR = 0x534F0010, assume Exynos M2 has: + * - CPU variant 0x4 + * - CPU part 0x001 + */ + *uarch = cpuinfo_uarch_exynos_m2; + break; + case 0x00100020: + /* + * Exynos 9810 MIDR = 0x531F0020, assume Exynos M3 has: + * - CPU variant 0x1 + * - CPU part 0x002 + */ + *uarch = cpuinfo_uarch_exynos_m3; + break; + case 0x00100030: + /* + * Exynos 9820 MIDR = 0x531F0030, assume Exynos M4 has: + * - CPU variant 0x1 + * - CPU part 0x003 + */ + *uarch = cpuinfo_uarch_exynos_m4; + break; + case 0x00100040: + /* + * Exynos 9820 MIDR = 0x531F0040, assume Exynos M5 
has: + * - CPU variant 0x1 + * - CPU part 0x004 + */ + *uarch = cpuinfo_uarch_exynos_m5; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Samsung CPU variant 0x" + << std::hex << midr_get_variant(midr) << " part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#if defined(_M_ARM) || defined(__arm__) + case 'V': + switch (midr_get_part(midr)) { + case 0x581: /* PJ4 / PJ4B */ + case 0x584: /* PJ4B-MP / PJ4C */ + *uarch = cpuinfo_uarch_pj4; + break; + default: + LOGS_DEFAULT(WARNING) << "unknown Marvell CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; + } + break; + //#endif /* ARM */ + default: + LOGS_DEFAULT(WARNING) << "unknown CPU uarch from MIDR value: 0x" << std::hex << midr; + } +} + +#endif // arm or arm64 + +} // namespace onnxruntime diff --git a/onnxruntime/core/common/cpuid_uarch.h b/onnxruntime/core/common/cpuid_uarch.h index 73a05c4b91a3a..c781561292209 100644 --- a/onnxruntime/core/common/cpuid_uarch.h +++ b/onnxruntime/core/common/cpuid_uarch.h @@ -22,6 +22,14 @@ Module Name: --*/ +#pragma once + +#include + +#include "core/common/cpuid_arch_definition.h" + +namespace onnxruntime { + enum CPUIDINFOuarch { /** Microarchitecture is unknown, or the library failed to get information about the microarchitecture from OS */ cpuinfo_uarch_unknown = 0, @@ -175,359 +183,8 @@ enum CPUIDINFOuarch { #if defined(CPUIDINFO_ARCH_ARM) -#define CPUINFO_ARM_MIDR_IMPLEMENTER_MASK UINT32_C(0xFF000000) -#define CPUINFO_ARM_MIDR_VARIANT_MASK UINT32_C(0x00F00000) -#define CPUINFO_ARM_MIDR_ARCHITECTURE_MASK UINT32_C(0x000F0000) -#define CPUINFO_ARM_MIDR_PART_MASK UINT32_C(0x0000FFF0) -#define CPUINFO_ARM_MIDR_REVISION_MASK UINT32_C(0x0000000F) - -#define CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET 24 -#define CPUINFO_ARM_MIDR_VARIANT_OFFSET 20 -#define CPUINFO_ARM_MIDR_ARCHITECTURE_OFFSET 16 -#define CPUINFO_ARM_MIDR_PART_OFFSET 4 -#define CPUINFO_ARM_MIDR_REVISION_OFFSET 0 - -inline static uint32_t midr_get_implementer(uint32_t midr) { - return (midr & CPUINFO_ARM_MIDR_IMPLEMENTER_MASK) >> CPUINFO_ARM_MIDR_IMPLEMENTER_OFFSET; -} - -inline static uint32_t midr_get_part(uint32_t midr) { - return (midr & CPUINFO_ARM_MIDR_PART_MASK) >> CPUINFO_ARM_MIDR_PART_OFFSET; -} - -inline static uint32_t midr_get_variant(uint32_t midr) { - return (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) >> CPUINFO_ARM_MIDR_VARIANT_OFFSET; -} - -static void decodeMIDR( - uint32_t midr, - uint32_t uarch[1]) { - switch (midr_get_implementer(midr)) { - case 'A': - switch (midr_get_part(midr)) { - //#if defined(_M_ARM) || defined(__arm__) - case 0xC05: - *uarch = cpuinfo_uarch_cortex_a5; - break; - case 0xC07: - *uarch = cpuinfo_uarch_cortex_a7; - break; - case 0xC08: - *uarch = cpuinfo_uarch_cortex_a8; - break; - case 0xC09: - *uarch = cpuinfo_uarch_cortex_a9; - break; - case 0xC0C: - *uarch = cpuinfo_uarch_cortex_a12; - break; - case 0xC0E: - *uarch = cpuinfo_uarch_cortex_a17; - break; - case 0xC0D: - /* - * Rockchip RK3288 only. - * Core information is ambiguous: some sources specify Cortex-A12, others - Cortex-A17. - * Assume it is Cortex-A12. - */ - *uarch = cpuinfo_uarch_cortex_a12; - break; - case 0xC0F: - *uarch = cpuinfo_uarch_cortex_a15; - break; - //#endif /* ARM */ - case 0xD01: - *uarch = cpuinfo_uarch_cortex_a32; - break; - case 0xD03: - *uarch = cpuinfo_uarch_cortex_a53; - break; - case 0xD04: - *uarch = cpuinfo_uarch_cortex_a35; - break; - case 0xD05: - // Note: use Variant, not Revision, field - *uarch = (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) == 0 ? 
cpuinfo_uarch_cortex_a55r0 : cpuinfo_uarch_cortex_a55; - break; - case 0xD06: - *uarch = cpuinfo_uarch_cortex_a65; - break; - case 0xD07: - *uarch = cpuinfo_uarch_cortex_a57; - break; - case 0xD08: - *uarch = cpuinfo_uarch_cortex_a72; - break; - case 0xD09: - *uarch = cpuinfo_uarch_cortex_a73; - break; - case 0xD0A: - *uarch = cpuinfo_uarch_cortex_a75; - break; - case 0xD0B: - *uarch = cpuinfo_uarch_cortex_a76; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xD0C: - *uarch = cpuinfo_uarch_neoverse_n1; - break; - //#endif /* ARM64 && !defined(__ANDROID__) */ - case 0xD0D: - *uarch = cpuinfo_uarch_cortex_a77; - break; - case 0xD0E: /* Cortex-A76AE */ - *uarch = cpuinfo_uarch_cortex_a76; - break; - case 0xD41: /* Cortex-A78 */ - *uarch = cpuinfo_uarch_cortex_a78; - break; - case 0xD44: /* Cortex-X1 */ - *uarch = cpuinfo_uarch_cortex_x1; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xD4A: - *uarch = cpuinfo_uarch_neoverse_e1; - break; - //#endif /* ARM64 && !defined(__ANDROID__) */ - default: - switch (midr_get_part(midr) >> 8) { - //#if defined(_M_ARM) || defined(__arm__) - case 7: - *uarch = cpuinfo_uarch_arm7; - break; - case 9: - *uarch = cpuinfo_uarch_arm9; - break; - case 11: - *uarch = cpuinfo_uarch_arm11; - break; - //#endif /* ARM */ - default: - LOGS_DEFAULT(WARNING) << "unknown ARM CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - } - break; - case 'B': - switch (midr_get_part(midr)) { - case 0x00F: - *uarch = cpuinfo_uarch_brahma_b15; - break; - case 0x100: - *uarch = cpuinfo_uarch_brahma_b53; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0x516: - /* Broadcom Vulkan was sold to Cavium before it reached the market, so we identify it as Cavium ThunderX2 */ - *uarch = cpuinfo_uarch_thunderx2; - break; - //#endif - default: - LOGS_DEFAULT(WARNING) << "unknown Broadcom CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 'C': - switch (midr_get_part(midr)) { - case 0x0A0: /* ThunderX */ - case 0x0A1: /* ThunderX 88XX */ - case 0x0A2: /* ThunderX 81XX */ - case 0x0A3: /* ThunderX 83XX */ - *uarch = cpuinfo_uarch_thunderx; - break; - case 0x0AF: /* ThunderX2 99XX */ - *uarch = cpuinfo_uarch_thunderx2; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Cavium CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#endif - case 'H': - switch (midr_get_part(midr)) { - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xD01: /* Kunpeng 920 series */ - *uarch = cpuinfo_uarch_taishan_v110; - break; - //#endif - case 0xD40: /* Kirin 980 Big/Medium cores -> Cortex-A76 */ - *uarch = cpuinfo_uarch_cortex_a76; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Huawei CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#if defined(_M_ARM) || defined(__arm__) - case 'i': - switch (midr_get_part(midr) >> 8) { - case 2: /* PXA 210/25X/26X */ - case 4: /* PXA 27X */ - case 6: /* PXA 3XX */ - *uarch = cpuinfo_uarch_xscale; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Intel CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - //#endif /* ARM */ - case 'N': - switch (midr_get_part(midr)) { - case 0x000: - *uarch = cpuinfo_uarch_denver; - break; - case 0x003: - *uarch = cpuinfo_uarch_denver2; - break; - case 0x004: - 
*uarch = cpuinfo_uarch_carmel; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Nvidia CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; -#if !defined(__ANDROID__) - case 'P': - switch (midr_get_part(midr)) { - case 0x000: - *uarch = cpuinfo_uarch_xgene; - break; - default: - LOGS_DEFAULT(WARNING) << "unknown Applied Micro CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; -#endif - case 'Q': - switch (midr_get_part(midr)) { - // #if defined(_M_ARM) || defined(__arm__) - case 0x00F: - /* Mostly Scorpions, but some Cortex A5 may report this value as well */ - //if (has_vfpv4) { - // /* Unlike Scorpion, Cortex-A5 comes with VFPv4 */ - // *vendor = cpuinfo_vendor_arm; - // *uarch = cpuinfo_uarch_cortex_a5; - //} else { - *uarch = cpuinfo_uarch_scorpion; - // } - break; - case 0x02D: /* Dual-core Scorpions */ - *uarch = cpuinfo_uarch_scorpion; - break; - case 0x04D: - /* - * Dual-core Krait: - * - r1p0 -> Krait 200 - * - r1p4 -> Krait 200 - * - r2p0 -> Krait 300 - */ - case 0x06F: - /* - * Quad-core Krait: - * - r0p1 -> Krait 200 - * - r0p2 -> Krait 200 - * - r1p0 -> Krait 300 - * - r2p0 -> Krait 400 (Snapdragon 800 MSMxxxx) - * - r2p1 -> Krait 400 (Snapdragon 801 MSMxxxxPRO) - * - r3p1 -> Krait 450 - */ - *uarch = cpuinfo_uarch_krait; - break; - //#endif /* ARM */ - case 0x201: /* Qualcomm Snapdragon 821: Low-power Kryo "Silver" */ - case 0x205: /* Qualcomm Snapdragon 820 & 821: High-performance Kryo "Gold" */ - case 0x211: /* Qualcomm Snapdragon 820: Low-power Kryo "Silver" */ - *uarch = cpuinfo_uarch_kryo; - break; - case 0x800: /* High-performance Kryo 260 (r10p2) / Kryo 280 (r10p1) "Gold" -> Cortex-A73 */ - *uarch = cpuinfo_uarch_cortex_a73; - break; - case 0x801: /* Low-power Kryo 260 / 280 "Silver" -> Cortex-A53 */ - *uarch = cpuinfo_uarch_cortex_a53; - break; - case 0x802: /* High-performance Kryo 385 "Gold" -> Cortex-A75 */ - *uarch = cpuinfo_uarch_cortex_a75; - break; - case 0x803: /* Low-power Kryo 385 "Silver" -> Cortex-A55r0 */ - *uarch = cpuinfo_uarch_cortex_a55r0; - break; - case 0x804: /* High-performance Kryo 485 "Gold" / "Gold Prime" -> Cortex-A76 */ - *uarch = cpuinfo_uarch_cortex_a76; - break; - case 0x805: /* Low-performance Kryo 485 "Silver" -> Cortex-A55 */ - *uarch = cpuinfo_uarch_cortex_a55; - break; - //#if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(__ANDROID__) - case 0xC00: - *uarch = cpuinfo_uarch_falkor; - break; - case 0xC01: - *uarch = cpuinfo_uarch_saphira; - break; - //#endif /* ARM64 && !defined(__ANDROID__) */ - default: - LOGS_DEFAULT(WARNING) << "unknown Qualcomm CPU part 0x" << std::hex << midr_get_part(midr) << " ignored"; - } - break; - case 'S': - switch (midr & (CPUINFO_ARM_MIDR_VARIANT_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { - case 0x00100010: - /* - * Exynos 8890 MIDR = 0x531F0011, assume Exynos M1 has: - * - CPU variant 0x1 - * - CPU part 0x001 - */ - *uarch = cpuinfo_uarch_exynos_m1; - break; - case 0x00400010: - /* - * Exynos 8895 MIDR = 0x534F0010, assume Exynos M2 has: - * - CPU variant 0x4 - * - CPU part 0x001 - */ - *uarch = cpuinfo_uarch_exynos_m2; - break; - case 0x00100020: - /* - * Exynos 9810 MIDR = 0x531F0020, assume Exynos M3 has: - * - CPU variant 0x1 - * - CPU part 0x002 - */ - *uarch = cpuinfo_uarch_exynos_m3; - break; - case 0x00100030: - /* - * Exynos 9820 MIDR = 0x531F0030, assume Exynos M4 has: - * - CPU variant 0x1 - * - CPU part 0x003 - */ - *uarch = cpuinfo_uarch_exynos_m4; - break; - case 0x00100040: - /* - * Exynos 9820 MIDR = 0x531F0040, assume Exynos M5 
has:
-     *   - CPU variant 0x1
-     *   - CPU part 0x004
-     */
-      *uarch = cpuinfo_uarch_exynos_m5;
-      break;
-    default:
-      LOGS_DEFAULT(WARNING) << "unknown Samsung CPU variant 0x"
-                            << std::hex << midr_get_variant(midr) << " part 0x" << std::hex << midr_get_part(midr) << " ignored";
-  }
-  break;
-  //#if defined(_M_ARM) || defined(__arm__)
-  case 'V':
-    switch (midr_get_part(midr)) {
-      case 0x581: /* PJ4 / PJ4B */
-      case 0x584: /* PJ4B-MP / PJ4C */
-        *uarch = cpuinfo_uarch_pj4;
-        break;
-      default:
-        LOGS_DEFAULT(WARNING) << "unknown Marvell CPU part 0x" << std::hex << midr_get_part(midr) << " ignored";
-    }
-    break;
-  //#endif /* ARM */
-  default:
-    LOGS_DEFAULT(WARNING) << "unknown CPU uarch from MIDR value: 0x" << std::hex << midr;
-  }
-}
+void decodeMIDR(uint32_t midr, uint32_t uarch[1]);
 #endif // arm or arm64
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
index 6cdb63319a5bf..0e996ec98a730 100644
--- a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
+++ b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
@@ -2001,7 +2001,9 @@ OptimizeResult OptimizeImpl(OptimizerCtx& ctx) {
       continue;
     }

-    if (!HandleQuantizeDequantizeScale(ctx.graph, *perm, *dq_node, ctx.opset)) {
+    // we're moving the Transpose to before the DQ, so we need to use the inverse permutation to update the axis
+    // attribute correctly when doing per-axis dequantization
+    if (!HandleQuantizeDequantizeScale(ctx.graph, InvertPerm(*perm), *dq_node, ctx.opset)) {
       continue;
     }

diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index 4358ddc7c3e26..c18075cc7b4b9 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -40,6 +40,23 @@ namespace onnxruntime {

 namespace {

+class UnmapFileParam {
+ public:
+  void* addr;
+  size_t len;
+};
+
+static void UnmapFile(void* param) noexcept {
+  UnmapFileParam* p = reinterpret_cast(param);
+  bool ret = UnmapViewOfFile(p->addr);
+  if (!ret) {
+    const auto error_code = GetLastError();
+    LOGS_DEFAULT(ERROR) << "unmap view of file failed. 
error code: " << error_code + << " error msg: " << std::system_category().message(error_code); + } + delete p; +} + std::wstring Basename(const std::wstring& path) { auto basename_index = path.find_last_of(L"/\\") + 1; // results in 0 if no separator is found return path.substr(basename_index); @@ -320,8 +337,95 @@ class WindowsEnv : public Env { return Status::OK(); } + /** Status MapFileIntoMemory(_In_z_ const ORTCHAR_T*, FileOffsetType, size_t, MappedMemoryPtr&) const override { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "MapFileIntoMemory is not implemented on Windows."); + }*/ + + Status MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path, + FileOffsetType offset, + size_t length, + MappedMemoryPtr& mapped_memory) const override { + ORT_RETURN_IF_NOT(file_path, "file_path == nullptr"); + ORT_RETURN_IF_NOT(offset >= 0, "offset < 0"); + + if (length == 0) { + mapped_memory = MappedMemoryPtr{}; + return Status::OK(); + } + +#if WINVER >= _WIN32_WINNT_WIN8 + wil::unique_hfile file_handle{ + CreateFile2(file_path, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL)}; +#else + wil::unique_hfile file_handle{ + CreateFileW(file_path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)}; +#endif + if (file_handle.get() == INVALID_HANDLE_VALUE) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "open file ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + +#if NTDDI_VERSION >= NTDDI_WIN10_RS5 + wil::unique_hfile file_mapping_handle{ + CreateFileMapping2(file_handle.get(), + nullptr, + FILE_MAP_READ, + PAGE_READONLY, + SEC_COMMIT, + 0, + nullptr, + nullptr, + 0)}; +#else + wil::unique_hfile file_mapping_handle{ + CreateFileMappingW(file_handle.get(), + nullptr, + PAGE_READONLY, + 0, + 0, + nullptr)}; +#endif + if (file_mapping_handle.get() == INVALID_HANDLE_VALUE) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "open file mapping ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + + static const DWORD page_size = sysinfo.dwPageSize; + static const DWORD allocation_granularity = sysinfo.dwAllocationGranularity; + const FileOffsetType offset_to_page = offset % static_cast(page_size); + const size_t mapped_length = length + static_cast(offset_to_page); + const FileOffsetType mapped_offset = offset - offset_to_page; + if (mapped_offset % allocation_granularity != 0) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "mapped offset must be a multiple of the allocation granularity", + " , mapped_offset = ", mapped_offset, + " , allocation_granularity = ", allocation_granularity, + " , errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } + + void* const mapped_base = MapViewOfFile(file_mapping_handle.get(), + FILE_MAP_READ, + 0, + static_cast(mapped_offset), + mapped_length); + + mapped_memory = + MappedMemoryPtr{reinterpret_cast(mapped_base) + offset_to_page, + OrtCallbackInvoker{OrtCallback{UnmapFile, new UnmapFileParam{mapped_base, mapped_length}}}}; + + return Status::OK(); } bool FolderExists(const std::wstring& path) const override { diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.h b/onnxruntime/core/providers/cpu/object_detection/roialign.h index 9ba7f89caf4d4..1bb8bd34c5cb2 
100644
--- a/onnxruntime/core/providers/cpu/object_detection/roialign.h
+++ b/onnxruntime/core/providers/cpu/object_detection/roialign.h
@@ -29,7 +29,6 @@ class RoiAlignBase {
     } else {
       ORT_THROW("Invalid mode of value ", mode, " specified. It should be either avg or max");
     }
-    mode_ = mode == "avg" ? RoiAlignMode::avg : RoiAlignMode::max;
   }

   // output_height
@@ -64,6 +63,13 @@ class RoiAlignBase {
     else
       half_pixel_ = false;
   }
+
+    if (mode_ == RoiAlignMode::max && sampling_ratio_ != 1) {
+      // TODO(fdwr): Issue #6146. ORT 1.13 will correct the summation of max mode via PR #7354.
+      LOGS_DEFAULT(WARNING) << "The current summation for max mode with sampling ratios other than 1 is incorrect "
+                            << "and will be fixed in the ORT 1.13 release, so the results of RoiAlign "
+                            << "will change.";
+    }
 }

 protected:
diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
index 68e2be1b34cf7..cfeb9a220277e 100644
--- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc
+++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
@@ -5,6 +5,8 @@
 #include

 #include "gtest/gtest.h"
+#include "gmock/gmock.h"
+
 #include "graph_transform_test_builder.h"

 #include "core/graph/graph.h"
@@ -3620,7 +3622,6 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearTransposePropagation) {
     EXPECT_EQ(op_types_in_order, expected_op_types_in_order);
   };

-
   TransformerTester(build_test_case_1,
                     check_graph,
                     TransformerLevel::Default,
@@ -4047,5 +4048,41 @@ TEST(TransposeOptimizerTests, RegressionTest_GitHubIssue10305) {
   ASSERT_STATUS_OK(session_object.Load(model_uri));
   ASSERT_STATUS_OK(session_object.Initialize());  // optimizers run during initialization
 }
+
+// regression test for a model with a DQ node using per-axis dequantization followed by a Transpose.
+// the second phase can swap those around, but needs to use the correct perms for updating the 'axis'
+// attribute in the DQ node.
+// see https://github.com/microsoft/onnxruntime/issues/12151 for more details.
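To spell out the fix the comment above describes: when a Transpose with permutation perm is moved in front of a per-axis DQ, the DQ's axis attribute has to be remapped through the inverse permutation, which is what the InvertPerm call added to the optimizer does. A self-contained sketch (helper names here are illustrative, not the optimizer's internal API):

```cpp
#include <cstddef>
#include <vector>

// Inverse of a permutation: inv[perm[i]] = i.
std::vector<size_t> InvertPerm(const std::vector<size_t>& perm) {
  std::vector<size_t> inv(perm.size());
  for (size_t i = 0; i < perm.size(); ++i) {
    inv[perm[i]] = i;
  }
  return inv;
}

// Axis `axis` of the original input sits at InvertPerm(perm)[axis] once the
// data has been transposed, so that is the DQ node's new 'axis' value.
size_t RemapDqAxis(size_t axis, const std::vector<size_t>& perm) {
  return InvertPerm(perm)[axis];
}
```

For example, with perm = {0, 2, 3, 1} (NCHW to NHWC) a per-channel DQ on axis 1 ends up on axis 3.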
+TEST(TransposeOptimizerTests, RegressionTest_GitHubIssue12151) { + Status status; + auto model_uri = ORT_TSTR("testdata/ort_github_issue_12151.onnx"); + + NameMLValMap feeds; // no inputs for this model + std::vector output_names{"Z"}; + std::vector fetches_orig; + std::vector fetches; + + SessionOptions so; + so.session_logid = "TransposeOptimizerTests.RegressionTest_GitHubIssue12151"; + + { + so.graph_optimization_level = TransformerLevel::Default; // off + InferenceSession session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(model_uri)); + ASSERT_STATUS_OK(session_object.Initialize()); + ASSERT_STATUS_OK(session_object.Run(feeds, output_names, &fetches_orig)); + } + + { + so.graph_optimization_level = TransformerLevel::Level1; // enable transpose optimizer + InferenceSession session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(model_uri)); + ASSERT_STATUS_OK(session_object.Initialize()); + ASSERT_STATUS_OK(session_object.Run(feeds, output_names, &fetches)); + } + + ASSERT_THAT(fetches_orig[0].Get().DataAsSpan(), + testing::ContainerEq(fetches[0].Get().DataAsSpan())); +} } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc index e4670583f11b9..9dec13767ef90 100644 --- a/onnxruntime/test/platform/file_io_test.cc +++ b/onnxruntime/test/platform/file_io_test.cc @@ -10,6 +10,8 @@ #ifndef _WIN32 #include // for sysconf() and _SC_PAGESIZE +#else +#include #endif #include "gsl/gsl" @@ -61,7 +63,11 @@ std::vector GenerateData(size_t length, uint32_t seed = 0) { } void WriteDataToFile(gsl::span data, const PathString& path) { +#ifndef _WIN32 std::ofstream out{path, std::ios_base::out | std::ios_base::trunc}; +#else + std::ofstream out{path, std::ios_base::out | std::ios_base::trunc | std::ios_base::binary}; +#endif out.write(data.data(), data.size()); } @@ -144,6 +150,59 @@ TEST(FileIoTest, MapFileIntoMemory) { ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK()); } } +#else +TEST(FileIoTest, MapFileIntoMemory) { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + static const auto page_size = sysinfo.dwPageSize; + static const auto allocation_granularity = sysinfo.dwAllocationGranularity; + ASSERT_GT(page_size, static_cast(0)); + + TempFilePath tmp(ORT_TSTR("map_file_test_")); + const auto expected_data = GenerateData(page_size * 3 / 2); + WriteDataToFile(gsl::make_span(expected_data), tmp.path); + + const auto offsets_and_lengths = GenerateValidOffsetLengthPairs( + 0, expected_data.size(), page_size / 10); + + for (const auto& offset_and_length : offsets_and_lengths) { + const auto offset = offset_and_length.first; + const auto length = offset_and_length.second; + + // The offset must be a multiple of the allocation granularity + if (offset % allocation_granularity != 0) { + continue; + } + + Env::MappedMemoryPtr mapped_memory{}; + auto status = Env::Default().MapFileIntoMemory( + tmp.path.c_str(), offset, length, mapped_memory); + ASSERT_TRUE(status.IsOK()) + << "MapFileIntoMemory failed for offset " << offset << " and length " << length + << " with error: " << status.ErrorMessage(); + + auto mapped_span = gsl::make_span(mapped_memory.get(), length); + + auto expected_data_span = gsl::make_span(expected_data.data() + offset, length); + + ASSERT_EQ(mapped_span, expected_data_span); + } + + { + Env::MappedMemoryPtr mapped_memory{}; + + // invalid - offset is not a multiple of the allocation granularity + 
ASSERT_FALSE(Env::Default().MapFileIntoMemory(
+        tmp.path.c_str(), allocation_granularity * 3 / 2, page_size / 10, mapped_memory).IsOK());
+  }
+
+  {
+    Env::MappedMemoryPtr mapped_memory{};
+
+    // invalid - negative offset
+    ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK());
+  }
+}
 #endif

 } // namespace test
diff --git a/onnxruntime/test/testdata/ort_github_issue_12151.onnx b/onnxruntime/test/testdata/ort_github_issue_12151.onnx
new file mode 100644
index 0000000000000..f796b46f1bdc2
Binary files /dev/null and b/onnxruntime/test/testdata/ort_github_issue_12151.onnx differ
diff --git a/orttraining/orttraining/python/training/optim/_ds_modifier.py b/orttraining/orttraining/python/training/optim/_ds_modifier.py
index d9515041f5dfa..6ae6ccee51184 100644
--- a/orttraining/orttraining/python/training/optim/_ds_modifier.py
+++ b/orttraining/orttraining/python/training/optim/_ds_modifier.py
@@ -10,10 +10,11 @@
 # - has_overflow_partitioned_grads_serial : https://github.com/microsoft/DeepSpeed/blob/d8e9ef6f99e27bb95e10bd146d145b3372b4cfda/deepspeed/runtime/zero/stage2.py#L1799
 # --------------------------------------------------------------------------

-import torch
 import types
 import warnings
 from distutils.version import LooseVersion
+
+import torch
 from numpy import inf

 from ._modifier import FP16OptimizerModifier, check_overflow, check_overflow_for_grads
@@ -27,14 +28,11 @@ def __init__(self, optimizer, **kwargs) -> None:
         super().__init__(optimizer)

     def can_be_modified(self):
-        try:
-            import deepspeed
-
-            v = LooseVersion(deepspeed.__version__)
-            if v > LooseVersion("0.5.4") or v < LooseVersion("0.4.0"):
-                warnings.warn("Unsupported DeepSpeed version to override, skipped.", UserWarning)
-                return False
-        except Exception as _:
+        import deepspeed
+
+        ds_version = LooseVersion(deepspeed.__version__)
+        if ds_version > LooseVersion("0.6.5") or ds_version < LooseVersion("0.4.0"):
+            warnings.warn("Skipping optimizer modification: unsupported DeepSpeed version.", UserWarning)
             return False

         return self.check_requirements(
@@ -141,7 +139,8 @@ def has_overflow_partitioned_grads_serial(target):
         #### END OF THE ORIGINAL IMPLEMENTATION ####

         #### THIS IS THE FASTER IMPLEMENTATION ####
-        for i in range(len(target.fp16_groups)):
+        groups = target.fp16_groups if hasattr(target, "fp16_groups") else target.bit16_groups
+        for i in range(len(groups)):
             grad_data = [grad.data for grad in target.averaged_gradients[i] if grad is not None]
             if check_overflow_for_grads(grad_data):
                 return True
diff --git a/orttraining/orttraining/python/training/optim/_modifier.py b/orttraining/orttraining/python/training/optim/_modifier.py
index 9897ed41210e6..b3ad73110d34a 100644
--- a/orttraining/orttraining/python/training/optim/_modifier.py
+++ b/orttraining/orttraining/python/training/optim/_modifier.py
@@ -9,6 +9,7 @@
 # --------------------------------------------------------------------------

 import torch
+import warnings
 from numpy import inf

 from ._multi_tensor_apply import MultiTensorApply
@@ -32,12 +33,16 @@ def check_requirements(self, required_funcs, require_apex=False, require_torch_n
             if require_torch_non_finite_check is True:
                 _ = torch._amp_foreach_non_finite_check_and_unscale_
         except Exception as _:
+            warnings.warn("Skipping optimizer modification: Apex or torch._amp_foreach_non_finite_check_and_unscale_ was not found.", UserWarning)
             return False

         if required_funcs:
             for func_name in required_funcs:
                 func = getattr(self._optimizer, func_name, None)
                 if not func or not callable(func):
+                    warnings.warn(
+                        "Skipping optimizer modification: a required function was not found on the optimizer.", UserWarning
+                    )
                     return False

         return True
diff --git a/orttraining/orttraining/python/training/optim/_modifier_registry.py b/orttraining/orttraining/python/training/optim/_modifier_registry.py
index 142999f3f72c7..4291b792a4607 100644
--- a/orttraining/orttraining/python/training/optim/_modifier_registry.py
+++ b/orttraining/orttraining/python/training/optim/_modifier_registry.py
@@ -7,12 +7,9 @@
 from ._megatron_modifier import LegacyMegatronLMModifier
 from ._apex_amp_modifier import ApexAMPModifier

-LEAGCY_MEGATRON_LM_OPTIMIZER_NAME = "megatron.fp16.fp16.FP16_Optimizer"
-DEEPSPEED_ZERO1_AND_ZERO2_OPTIMIZER_NAME = "deepspeed.runtime.zero.stage2.FP16_DeepSpeedZeroOptimizer"
-APEX_AMP_OPTIMIZER_NAME = "apex.amp.optimizer.unique_name_as_id"
-
 OptimizerModifierTypeRegistry = {
-    LEAGCY_MEGATRON_LM_OPTIMIZER_NAME: LegacyMegatronLMModifier,
-    DEEPSPEED_ZERO1_AND_ZERO2_OPTIMIZER_NAME: DeepSpeedZeROModifier,
-    APEX_AMP_OPTIMIZER_NAME: ApexAMPModifier,
+    "megatron.fp16.fp16.FP16_Optimizer": LegacyMegatronLMModifier,
+    "deepspeed.runtime.zero.stage2.FP16_DeepSpeedZeroOptimizer": DeepSpeedZeROModifier,
+    "deepspeed.runtime.zero.stage_1_and_2.DeepSpeedZeroOptimizer": DeepSpeedZeROModifier,
+    "apex.amp.optimizer.unique_name_as_id": ApexAMPModifier,
 }
diff --git a/orttraining/orttraining/python/training/optim/fp16_optimizer.py b/orttraining/orttraining/python/training/optim/fp16_optimizer.py
index c4c353249f1ee..c3864ea711f24 100644
--- a/orttraining/orttraining/python/training/optim/fp16_optimizer.py
+++ b/orttraining/orttraining/python/training/optim/fp16_optimizer.py
@@ -3,6 +3,8 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------

+import warnings
+
 from ._modifier_registry import OptimizerModifierTypeRegistry

@@ -90,6 +92,7 @@ def get_full_qualified_type_name(o):

     optimizer_full_qualified_name = get_full_qualified_type_name(optimizer)
     if optimizer_full_qualified_name not in OptimizerModifierTypeRegistry:
+        warnings.warn("Skipping optimizer modification: optimizer type was not found in the registry.", UserWarning)
         return optimizer

     modifier = OptimizerModifierTypeRegistry[optimizer_full_qualified_name](optimizer, **kwargs)
diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py
index 63af43ce48eb7..1459d3b86dcdb 100644
--- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py
+++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py
@@ -4,14 +4,17 @@
 # --------------------------------------------------------------------------

 import sys
+import warnings
+
 import torch
 import torch.utils.checkpoint
-import warnings
+from packaging import version
 from torch.onnx import symbolic_helper

 from onnxruntime.capi._pybind_state import register_torch_autograd_function
-from ._fallback import _FallbackManager, ORTModuleONNXModelException, ORTModuleTorchModelException, wrap_exception
+
 from . import _logger
+from ._fallback import ORTModuleONNXModelException, ORTModuleTorchModelException, _FallbackManager, wrap_exception

 # Some autograd.Functions shouldn't be exported as PythonOp.
 # If CheckpointFunction is exported as PythonOp, the checkpointed computation
@@ -37,7 +40,15 @@ def _export_pt_1_10(g, n, *args, **kwargs):
             "wrap exportable sub-nn.Modules as ORTModule."
        )
     inplace = kwargs["inplace"]
-    training_mode = symbolic_helper._training_mode
+    # TODO: move to the public API once the exporter team exposes it.
+    training_mode = None
+    runtime_pytorch_version = version.parse(torch.__version__.split("+")[0])
+    if runtime_pytorch_version > version.parse("1.11"):
+        from torch.onnx import _globals
+
+        training_mode = _globals.GLOBALS.training_mode
+    else:
+        training_mode = symbolic_helper._training_mode
     cconv = n.cconv()
     input_tensor_types = []
     input_requires_grads = []
diff --git a/requirements-training.txt b/requirements-training.txt
index 4b1be6cef9b7c..82f0331314da6 100644
--- a/requirements-training.txt
+++ b/requirements-training.txt
@@ -4,6 +4,6 @@ h5py
 numpy >= 1.16.6
 onnx
 packaging
-protobuf
+protobuf >= 3.12.2, <= 3.20.1
 sympy
 setuptools>=41.4.0
diff --git a/setup.py b/setup.py
index db13319174440..0805a593723cb 100644
--- a/setup.py
+++ b/setup.py
@@ -463,10 +463,10 @@ def finalize_options(self):
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python",
         "Programming Language :: Python :: 3 :: Only",
-        "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
     ]

     if not enable_training:
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index f7567fbee8192..08cccff5e3033 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -875,7 +875,7 @@ def generate_build_tree(
         "-Donnxruntime_ENABLE_TRAINING_TORCH_INTEROP=" + ("ON" if args.enable_training_torch_interop else "OFF"),
         # Enable advanced computations such as AVX for some training-related ops.
         "-Donnxruntime_ENABLE_CPU_FP16_OPS=" + ("ON" if args.enable_training else "OFF"),
-        "-Donnxruntime_USE_NCCL=" + ("OFF" if args.disable_nccl else "ON"),
+        "-Donnxruntime_USE_NCCL=" + ("ON" if args.enable_training and not args.disable_nccl else "OFF"),
         "-Donnxruntime_BUILD_BENCHMARKS=" + ("ON" if args.build_micro_benchmarks else "OFF"),
         "-Donnxruntime_USE_ROCM=" + ("ON" if args.use_rocm else "OFF"),
         "-DOnnxruntime_GCOV_COVERAGE=" + ("ON" if args.code_coverage else "OFF"),
diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh
index e9e8ef4b3481f..da86205b52f61 100755
--- a/tools/ci_build/github/linux/copy_strip_binary.sh
+++ b/tools/ci_build/github/linux/copy_strip_binary.sh
@@ -36,7 +36,7 @@ then
   strip -S $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME
   ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib
   # copy the CoreML EP header for macOS build (libs with .dylib ext)
-  cp $SOURCE_DIR/onnxruntime/core/providers/coreml/coreml_execution_provider.h $BINARY_DIR/$ARTIFACT_NAME/include
+  cp $SOURCE_DIR/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include
 elif [[ $LIB_NAME == *.so.* ]]
 then
   ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.so
diff --git a/tools/ci_build/github/windows/jar_packaging.ps1 b/tools/ci_build/github/windows/jar_packaging.ps1
index 679e27b459efc..a132ba6b26e2a 100644
--- a/tools/ci_build/github/windows/jar_packaging.ps1
+++ b/tools/ci_build/github/windows/jar_packaging.ps1
@@ -16,8 +16,10 @@ Remove-Item -Path libcustom_op_library.dylib
 7z a $Env:BUILD_BINARIESDIRECTORY\java-artifact\onnxruntime-java-win-x64\onnxruntime-$Env:ONNXRUNTIMEVERSION.jar .
popd pushd onnxruntime-java-linux-aarch64 +Remove-Item -Path libcustom_op_library.so 7z a $Env:BUILD_BINARIESDIRECTORY\java-artifact\onnxruntime-java-win-x64\onnxruntime-$Env:ONNXRUNTIMEVERSION.jar . popd pushd onnxruntime-java-osx-arm64 +Remove-Item -Path libcustom_op_library.dylib 7z a $Env:BUILD_BINARIESDIRECTORY\java-artifact\onnxruntime-java-win-x64\onnxruntime-$Env:ONNXRUNTIMEVERSION.jar . popd
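One closing note on the Windows MapFileIntoMemory implementation added earlier in this patch: MapViewOfFile only accepts view offsets that are multiples of the allocation granularity (commonly 64 KiB), which is why both the implementation and the new file_io_test reject unaligned offsets. The offset arithmetic in isolation (illustrative constants; the real values come from GetSystemInfo):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t page_size = 4096;                // e.g. sysinfo.dwPageSize
  const int64_t granularity = 65536;             // e.g. sysinfo.dwAllocationGranularity
  const int64_t offset = 2 * granularity + 300;  // caller-requested file offset

  // Round down to a page boundary and keep the remainder; the patch returns
  // mapped_base + offset_to_page so the caller still sees the exact offset.
  const int64_t offset_to_page = offset % page_size;
  const int64_t mapped_offset = offset - offset_to_page;

  // MapViewOfFile additionally requires granularity alignment, which the
  // implementation checks explicitly before mapping.
  std::printf("mapped_offset=%lld offset_to_page=%lld aligned=%d\n",
              static_cast<long long>(mapped_offset),
              static_cast<long long>(offset_to_page),
              static_cast<int>(mapped_offset % granularity == 0));
  return 0;
}
```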