Adjusting max batch size in order to build TRT models on smaller GPUs
Author: Aleks · Committed: Apr 2, 2024
Parent: 639f04d · Commit: 2de2b1d
Showing 2 changed files with 2 additions and 2 deletions.
```diff
@@ -79,7 +79,7 @@ def parse_arguments():
     parser.add_argument("--quantize_dir", type=str, default="quantize/1-gpu")
     parser.add_argument("--dtype", type=str, default="float16", choices=["float16"])
     parser.add_argument("--log_level", type=str, default="info")
-    parser.add_argument("--max_batch_size", type=int, default=64)
+    parser.add_argument("--max_batch_size", type=int, default=24)
     parser.add_argument("--max_input_len", type=int, default=4)
     parser.add_argument("--max_output_len", type=int, default=448)
     parser.add_argument("--max_beam_width", type=int, default=1)
```
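Lowering the build-time default from 64 to 24 shrinks the activation and KV-cache workspace TensorRT reserves for the worst-case batch, which is what lets the engine build on GPUs with less memory. Purely as a sketch, and not part of this commit, the default could instead be estimated from the device's free memory; the helper name and the per-sequence byte cost below are hypothetical placeholders that would need measuring against the real engine:

```python
# Hypothetical sketch: pick a max_batch_size that fits the current GPU
# instead of hardcoding one. Assumes a CUDA device is visible to torch;
# BYTES_PER_SEQUENCE is a made-up figure, not a measured value.
import torch

BYTES_PER_SEQUENCE = 512 * 1024 * 1024  # placeholder per-sequence memory cost


def suggest_max_batch_size(safety_fraction: float = 0.8) -> int:
    """Return the largest batch size that should fit in free GPU memory."""
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    usable_bytes = int(free_bytes * safety_fraction)
    return max(1, usable_bytes // BYTES_PER_SEQUENCE)


# e.g. parser.add_argument("--max_batch_size", type=int,
#                          default=suggest_max_batch_size())
```

A fixed default like 24 keeps builds reproducible across machines, which is plausibly why the commit hardcodes a smaller value rather than probing the device.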
src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py (1 addition, 1 deletion)

```diff
@@ -93,7 +93,7 @@ def get_session(self, engine_dir, runtime_mapping, debug_mode=False):
 
         # TODO: Make dynamic max_batch_size and max_beam_width
         decoder_model_config = ModelConfig(
-            max_batch_size=64,
+            max_batch_size=24,
             max_beam_width=1,
             num_heads=self.decoder_config["num_heads"],
             num_kv_heads=self.decoder_config["num_heads"],
```
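The TODO above this hunk still applies after the change: max_batch_size remains a literal inside get_session, so it has to be edited in lockstep with the build flag whenever either one moves. A minimal sketch of one way to close that gap by threading the value through instead of hardcoding it; ModelConfig is stubbed here (the real class comes from TensorRT-LLM and carries many more fields), and the function name is assumed for illustration:

```python
# Hypothetical sketch: pass max_batch_size in, so the runtime value cannot
# drift from the one the engine was built with.
from dataclasses import dataclass


@dataclass
class ModelConfig:  # stand-in for the TensorRT-LLM runtime class
    max_batch_size: int
    max_beam_width: int


def make_decoder_config(max_batch_size: int = 24) -> ModelConfig:
    # Callers pass the same value given to --max_batch_size at build time.
    return ModelConfig(max_batch_size=max_batch_size, max_beam_width=1)


print(make_decoder_config(max_batch_size=24))
```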
