Cherry for release 1.12.0 final #12218

Merged: 15 commits, Jul 19, 2022
@@ -197,7 +197,6 @@ void InitBeamState(transformers::IBeamSearchState<T>* beam_state,

// T5 does not need position, so next_positions is empty for T5.
if (!beam_state->next_positions.empty()) {
memset(beam_state->next_positions.data(), 0, beam_state->next_positions.size_bytes());
gsl::copy(sequence_lengths, beam_state->next_positions);
}
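Note on the hunk above: the memset that zeroed next_positions before the copy is dropped, since gsl::copy overwrites the whole destination with the per-sequence prompt lengths anyway. A minimal standalone sketch of that seeding step, with illustrative spans and sizes (not ORT code):

```cpp
// Sketch only: seed next_positions directly from the sequence lengths,
// mirroring the retained InitBeamState line above. Types and sizes are
// illustrative assumptions.
#include <gsl/gsl>
#include <cstdint>

void SeedNextPositions(gsl::span<const int32_t> sequence_lengths,
                       gsl::span<int32_t> next_positions) {
  // T5 leaves next_positions empty, so the copy is skipped in that case.
  if (!next_positions.empty()) {
    gsl::copy(sequence_lengths, next_positions);
  }
}
```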

@@ -274,13 +273,13 @@ Status ProcessLogits(const OrtValue& logits, //
// Get scores for candidates of next token: next_token_scores = log_softmax(next_token_logits, dim=-1)
gsl::span<T>& next_token_scores = beam_state->next_token_scores;
ORT_RETURN_IF_ERROR(
SoftmaxCPU<T>(
batch_beam_size, // rows
vocab_size, // elements per row
(input_length == 1 && logits_batch_size == batch_beam_size) ? logits_data : next_token_logits.data(),
next_token_scores.data(),
true,
thread_pool));
SoftmaxCPU<T>(
batch_beam_size, // rows
vocab_size, // elements per row
(input_length == 1 && logits_batch_size == batch_beam_size) ? logits_data : next_token_logits.data(),
next_token_scores.data(),
true,
thread_pool));

#ifdef DEBUG_BEAM_SEARCH
dumper->Print("next_token_scores after softmax", next_token_scores.data(), batch_size, num_beams, vocab_size);
@@ -428,12 +427,12 @@ Status UpdateGptFeeds(
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices,
int num_beams,
int gpt_subgraph_first_past_input_idx,
int gpt_subgraph_first_present_output_idx,
const transformers::IConsoleDumper* dumper) {
int gpt_subgraph_first_present_output_idx) {
// last_outputs: logits, present_0, present_1, ...
// next_inputs: input_ids, position_id, attention_mask, past_0, past_1
ORT_UNUSED_PARAMETER(stream);
@@ -454,10 +453,12 @@ Status UpdateGptFeeds(
}
next_inputs[0] = input_ids;

// Update position IDs
int32_t* position_data = position_ids.GetMutable<Tensor>()->MutableData<int32_t>();
for (int i = 0; i < batch_beam_size; i++) {
position_data[i]++;
if (increase_position) {
// Update position IDs
int32_t* position_data = position_ids.GetMutable<Tensor>()->MutableData<int32_t>();
for (int i = 0; i < batch_beam_size; i++) {
position_data[i]++;
}
}
next_inputs[1] = position_ids;

@@ -477,14 +478,6 @@
}
next_inputs[2] = attention_mask;

#ifdef DEBUG_BEAM_SEARCH
dumper->Print("input_ids", input_ids);
dumper->Print("position_ids", position_ids);
dumper->Print("attention_mask", attention_mask);
#else
ORT_UNUSED_PARAMETER(dumper);
#endif

// Update past state
if (num_beams == 1) {
// feed present_* output to past_* inputs one by one
@@ -725,12 +718,12 @@ template Status UpdateGptFeeds<float>(
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices,
int num_beams,
int gpt_subgraph_first_past_input_idx,
int gpt_subgraph_first_present_output_idx,
const transformers::IConsoleDumper* dumper);
int gpt_subgraph_first_present_output_idx);

template Status UpdateDecoderFeeds<float>(
AllocatorPtr allocator,
@@ -751,28 +744,28 @@ template Status UpdateDecoderFeeds<float>(
template void ExpandInputs<int32_t>(const OrtValue& input, int num_beams, AllocatorPtr allocator, OrtValue& expanded);

template Status ExpandBuffer<int32_t>(
void* stream,
const OrtValue& input,
int num_beams,
AllocatorPtr allocator,
OrtValue& expanded,
bool only_copy_shape);
void* stream,
const OrtValue& input,
int num_beams,
AllocatorPtr allocator,
OrtValue& expanded,
bool only_copy_shape);

template Status ExpandBuffer<float>(
void* stream,
const OrtValue& input,
int num_beams,
AllocatorPtr allocator,
OrtValue& expanded,
bool only_copy_shape);
void* stream,
const OrtValue& input,
int num_beams,
AllocatorPtr allocator,
OrtValue& expanded,
bool only_copy_shape);

template Status ExpandBuffer<MLFloat16>(
void* stream,
const OrtValue& input,
int num_beams,
AllocatorPtr allocator,
OrtValue& expanded,
bool only_copy_shape);
void* stream,
const OrtValue& input,
int num_beams,
AllocatorPtr allocator,
OrtValue& expanded,
bool only_copy_shape);

} // namespace BeamSearchCpuDeviceHelper
} // namespace contrib
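The change running through this file threads a new increase_position flag into UpdateGptFeeds, so the position ids are advanced only when the caller asks for it. A minimal plain-C++ sketch of the guarded update (illustrative names, not the ORT signature):

```cpp
// Sketch: advance per-beam position ids only when increase_position is set;
// on the first decoding step the buffer already holds the prompt lengths.
#include <cstdint>
#include <vector>

void UpdatePositionIds(std::vector<int32_t>& position_data, bool increase_position) {
  if (!increase_position) {
    return;  // first iteration: keep the seeded sequence lengths as-is
  }
  for (auto& p : position_data) {
    ++p;  // later iterations: every batch*beam entry moves forward by one token
  }
}
```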
@@ -96,12 +96,12 @@ using UpdateGptFeedsFunc = std::function<Status(
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices,
int num_beams,
int gpt_subgraph_first_past_input_idx,
int gpt_subgraph_first_present_output_idx,
const transformers::IConsoleDumper* dumper)>;
int gpt_subgraph_first_present_output_idx)>;

// Create encoder inputs (for encoder-decoder model like T5).
using CreateEncoderInputsFunc = std::function<Status(
@@ -142,7 +142,6 @@ using ExpandBufferFunc = std::function<Status(
bool only_copy_shape)>;
} // namespace BeamSearchDeviceHelper


// These are CPU specific device helper implementations
namespace BeamSearchCpuDeviceHelper {
Status TopK(
@@ -208,12 +207,12 @@ Status UpdateGptFeeds(
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices,
int num_beams,
int gpt_subgraph_first_past_input_idx,
int gpt_subgraph_first_present_output_idx,
const transformers::IConsoleDumper* dumper);
int gpt_subgraph_first_present_output_idx);

// ---------------------------------------------------------------
// Functions for encoder-decoder model like T5
25 changes: 18 additions & 7 deletions onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
@@ -56,6 +56,7 @@ class BeamSearchGpt : public BeamSearchBase<T> {
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices);

@@ -93,6 +94,7 @@ Status BeamSearchGpt<T>::UpdateFeeds(
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices) {
return update_feeds_func_(this->temp_space_allocator_,
@@ -101,12 +103,12 @@
next_inputs,
current_length,
position_ids,
increase_position,
beam_next_tokens,
beam_indices,
this->parameters_->num_beams,
gpt_subgraph_.GetFirstPastInputIndex(),
gpt_subgraph_.GetFirstPresentOutputIndex(),
this->GetConsoleDumper());
gpt_subgraph_.GetFirstPresentOutputIndex());
}

template <typename T>
@@ -186,11 +188,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager& feeds_fetches_manage

#ifdef DEBUG_BEAM_SEARCH
const IConsoleDumper* dumper = this->GetConsoleDumper();
dumper->Print("input_ids", feeds[0]);
dumper->Print("position_ids", feeds[1]);
dumper->Print("attention_mask", feeds[2]);
#endif

// Position ids for all iterations except the first. It uses memory buffer owned by next_positions.
OrtValue position_ids;
int64_t dims[] = {parameters->BatchBeamSize(), 1};
@@ -205,9 +203,19 @@
int iteration_counter = 0;
while (current_length < parameters->max_length) {
iteration_counter++;

#ifdef DEBUG_BEAM_SEARCH
auto cur_len = std::to_string(current_length);
dumper->Print("***CurrentLength", cur_len, true);
dumper->Print("iteration", iteration_counter, true);

dumper->Print("input_ids", feeds[0]);
dumper->Print("position_ids", feeds[1]);
dumper->Print("attention_mask", feeds[2]);
for (size_t i = 3; i < feeds.size(); i++) {
dumper->Print("past", static_cast<int>(i) - 3, true);
dumper->Print("", feeds[i]);
}
#endif

status = utils::ExecuteSubgraph(this->decoder_session_state_,
@@ -241,8 +249,11 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager& feeds_fetches_manage

// Prepare inputs for next round of subgraph call.
if (current_length < parameters->max_length) {
// For the first iteration, position_ids is initialized as sequence lengths. We can add it to feeds directly.
// For the remaining iterations, we need to increase position_ids first, then add it to feeds.
bool increase_position = (iteration_counter > 1);
ORT_RETURN_IF_ERROR(UpdateFeeds(fetches, feeds, current_length,
position_ids,
position_ids, increase_position,
beam_next_tokens.as_span<const int32_t>(),
beam_indices.as_span<const int32_t>()));
}
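As the new comment in this hunk explains, only the first pass through the generation loop may reuse the positions seeded from the sequence lengths. A hedged, self-contained sketch of that call-site decision; the loop bounds and names are assumptions for illustration, not the BeamSearchGpt API:

```cpp
// Sketch of the call-site pattern: increase_position is false only on the
// first iteration, where position_ids already equal the prompt lengths.
#include <iostream>

int main() {
  const int max_length = 8;
  int current_length = 5;   // assumed prompt length
  int iteration_counter = 0;
  while (current_length < max_length) {
    ++iteration_counter;
    // ... one decoder subgraph call would run here ...
    ++current_length;
    if (current_length < max_length) {
      const bool increase_position = (iteration_counter > 1);
      std::cout << "iteration " << iteration_counter << ": increase_position="
                << std::boolalpha << increase_position << '\n';
    }
  }
  return 0;
}
```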
@@ -258,9 +258,8 @@ Status ProcessLogits(const OrtValue& logits, //

// The output will be float for consideration of precision and easy integration with remaining parts.
float* Y_data = next_token_scores.data();
const CudaT* X_data = (input_length == 1 && logits_batch_size == batch_beam_size) ?
logits_data :
reinterpret_cast<const CudaT*>(next_token_logits.data());
bool is_single_token = (input_length == 1 && logits_batch_size == batch_beam_size);
const CudaT* X_data = is_single_token ? logits_data : reinterpret_cast<const CudaT*>(next_token_logits.data());

dispatch_blockwise_softmax_forward<CudaT, float, float, true>(
cuda_stream, Y_data, X_data, vocab_size, vocab_size, batch_size * num_beams);
@@ -500,12 +499,12 @@ Status UpdateGptFeeds(
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices,
int num_beams,
int gpt_subgraph_first_past_input_idx,
int gpt_subgraph_first_present_output_idx,
const transformers::IConsoleDumper* dumper) {
int gpt_subgraph_first_present_output_idx) {
// Update input_ids with next tokens.
int batch_beam_size = static_cast<int>(beam_next_tokens.length());
int64_t dims[] = {batch_beam_size, 1};
@@ -519,7 +518,7 @@
next_inputs[0] = input_ids;

// Update position IDs
int32_t* position_data = position_ids.GetMutable<Tensor>()->MutableData<int32_t>();
int32_t* position_data = increase_position ? position_ids.GetMutable<Tensor>()->MutableData<int32_t>() : nullptr;
next_inputs[1] = position_ids;

// Update attention mask
Expand All @@ -538,14 +537,6 @@ Status UpdateGptFeeds(

next_inputs[2] = attention_mask;

#ifdef DEBUG_BEAM_SEARCH
dumper->Print("input_ids", input_ids);
dumper->Print("position_ids", position_ids);
dumper->Print("attention_mask", attention_mask);
#else
ORT_UNUSED_PARAMETER(dumper);
#endif

// Update past state
if (num_beams == 1) {
const int k = gpt_subgraph_first_past_input_idx - gpt_subgraph_first_present_output_idx;
@@ -662,12 +653,12 @@ Status ExpandBuffer(void* stream,
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < num_beams; j++) {
CUDA_RETURN_IF_ERROR(
cudaMemcpyAsync(
target,
input_data + i * chunk_size,
sizeof(T) * chunk_size,
cudaMemcpyDeviceToDevice,
cuda_stream));
cudaMemcpyAsync(
target,
input_data + i * chunk_size,
sizeof(T) * chunk_size,
cudaMemcpyDeviceToDevice,
cuda_stream));
target += chunk_size;
}
}
@@ -714,12 +705,12 @@ template Status UpdateGptFeeds<float>(
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices,
int num_beams,
int gpt_subgraph_first_past_input_idx,
int gpt_subgraph_first_present_output_idx,
const transformers::IConsoleDumper* dumper);
int gpt_subgraph_first_present_output_idx);

// Float16
template void InitBeamState<MLFloat16>(transformers::IBeamSearchState<MLFloat16>* beam_state,
@@ -748,12 +739,12 @@ template Status UpdateGptFeeds<MLFloat16>(
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices,
int num_beams,
int gpt_subgraph_first_past_input_idx,
int gpt_subgraph_first_present_output_idx,
const transformers::IConsoleDumper* dumper);
int gpt_subgraph_first_present_output_idx);

template Status UpdateDecoderFeeds<float>(
AllocatorPtr allocator,
@@ -68,12 +68,12 @@ Status UpdateGptFeeds(
std::vector<OrtValue>& next_inputs,
int current_length,
OrtValue& position_ids,
bool increase_position,
gsl::span<const int32_t> beam_next_tokens,
gsl::span<const int32_t> beam_indices,
int num_beams,
int gpt_subgraph_first_past_input_idx,
int gpt_subgraph_first_present_output_idx,
const transformers::IConsoleDumper* dumper);
int gpt_subgraph_first_present_output_idx);

// ---------------------------------------------------------------
// Functions for encoder-decoder model like T5
8 changes: 5 additions & 3 deletions onnxruntime/contrib_ops/cuda/transformers/beam_search_impl.cu
@@ -248,9 +248,11 @@ __global__ void UpdateGptInputsKernel(const T* old_mask_data,
int j = index % current_length;
mask_data[index] = (j < current_length - 1) ? old_mask_data[i * (current_length - 1) + j] : static_cast<T>(1);

// Update sequence length (or next positions).
if (index < batch_beam_size) {
next_positions[index]++;
if (next_positions != nullptr) {
// Update sequence length (or next positions).
if (index < batch_beam_size) {
next_positions[index]++;
}
}
}
}
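On the CUDA path the same first-iteration rule is expressed by passing nullptr for next_positions, which the guarded kernel above then treats as a no-op. A plain-C++ sketch of that null-guarded update; illustrative only, not the actual kernel or its launch:

```cpp
// Sketch: a null next_positions pointer disables the position update,
// matching the `if (next_positions != nullptr)` guard added to the kernel.
#include <cstdint>
#include <cstdio>

void BumpPositions(int32_t* next_positions, int batch_beam_size) {
  if (next_positions == nullptr) {
    return;  // first iteration: positions stay at the seeded sequence lengths
  }
  for (int i = 0; i < batch_beam_size; ++i) {
    next_positions[i]++;
  }
}

int main() {
  int32_t positions[4] = {5, 5, 7, 7};  // assumed sequence lengths
  BumpPositions(nullptr, 4);            // increase_position == false
  BumpPositions(positions, 4);          // increase_position == true
  std::printf("%d %d %d %d\n", positions[0], positions[1], positions[2], positions[3]);
  return 0;
}
```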
14 changes: 14 additions & 0 deletions onnxruntime/core/common/cpuid_arch_definition.h
@@ -0,0 +1,14 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// This file defines the CPUIDINFO_ARCH_* symbols.

#pragma once

#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__i386__) || defined(__x86_64__)
#define CPUIDINFO_ARCH_X86
#endif

#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__)
#define CPUIDINFO_ARCH_ARM
#endif // ARM or ARM64
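The new header only defines architecture-detection macros. A hedged example of how downstream code could branch on them; the include path matches the file added above, but the surrounding program is illustrative, not part of this PR:

```cpp
// Sketch: compile-time dispatch on the CPUIDINFO_ARCH_* symbols defined
// in cpuid_arch_definition.h. The printed strings are illustrative only.
#include <iostream>
#include "core/common/cpuid_arch_definition.h"

int main() {
#if defined(CPUIDINFO_ARCH_X86)
  std::cout << "built for x86/x64\n";
#elif defined(CPUIDINFO_ARCH_ARM)
  std::cout << "built for ARM/ARM64\n";
#else
  std::cout << "built for another architecture\n";
#endif
  return 0;
}
```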