Skip to content

Commit

Permalink
Update TensorRT-LLM (NVIDIA#1793)
Browse files Browse the repository at this point in the history
Co-authored-by: DreamGenX <[email protected]>
Co-authored-by: Ace-RR <[email protected]>
Co-authored-by: bprus <[email protected]>
Co-authored-by: janpetrov <[email protected]>
  • Loading branch information
5 people authored Jun 18, 2024
1 parent db4edea commit 2a115da
Show file tree
Hide file tree
Showing 318 changed files with 8,621 additions and 4,763 deletions.
2 changes: 1 addition & 1 deletion benchmarks/cpp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ ${HOME}/.local/bin/trtllm-build \
--output_dir ${LORA_ENGINE} \
--max_batch_size ${MAX_BATCH} \
--max_input_len $MAX_LEN \
--max_output_len $MAX_LEN \
--max_seq_len $((2*${MAX_LEN})) \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/cpp/bertBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/rawEngine.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/worldConfig.h"
Expand Down Expand Up @@ -78,11 +79,10 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da
{
auto const worldConfig = WorldConfig::mpi();
auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
auto engineBlob = loadEngine(enginePath.string());

for (float gpuWeightsPercent : gpuWeightsPercents)
{
auto rt = std::make_shared<TllmRuntime>(engineBlob.data(), engineBlob.size(), gpuWeightsPercent, *logger);
auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
rt->addContext(0);
for (auto inLen : inLens)
{
Expand Down
13 changes: 13 additions & 0 deletions benchmarks/cpp/gptManagerBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ struct BenchmarkParams
bool streaming{false};
bool enableExpDelays{false};
std::optional<float> requestRate{std::nullopt};
std::optional<SizeType32> maxBatchSize{std::nullopt};
int randomSeed = 430;
std::optional<int> maxAttentionWindow{std::nullopt};

Expand Down Expand Up @@ -785,6 +786,10 @@ class ExecutorServer
executorConfig.setPeftCacheConfig(peftCacheConfig);
executorConfig.setBatchingType(
modelType == TrtGptModelType::V1 ? texec::BatchingType::kSTATIC : texec::BatchingType::kINFLIGHT);
if (benchmarkParams.maxBatchSize)
{
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
}

mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);

Expand Down Expand Up @@ -1339,6 +1344,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
optionalParams.kvCacheConfig.onboardBlocks = benchmarkParams.kvOnboardBlocks;
optionalParams.gpuWeightsPercent = benchmarkParams.gpuWeightsPercent;
optionalParams.maxBeamWidth = beamWidth;
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};

auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
Expand Down Expand Up @@ -1628,6 +1634,7 @@ int main(int argc, char* argv[])
options.add_options()("request_rate",
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
cxxopts::value<float>());
options.add_options()("max_batch_size", "The max runtime batch size when benchmarking", cxxopts::value<int>());
options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution",
cxxopts::value<bool>()->default_value("false"));
options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
Expand Down Expand Up @@ -1777,6 +1784,12 @@ int main(int argc, char* argv[])
benchmarkParams.requestRate = result["request_rate"].as<float>();
}

// Argument: max batch size
if (result.count("max_batch_size"))
{
benchmarkParams.maxBatchSize = result["max_batch_size"].as<int>();
}

benchmarkParams.enableExpDelays = result["enable_exp_delays"].as<bool>();

// Argument: Enable batch stats output
Expand Down
Loading

0 comments on commit 2a115da

Please sign in to comment.