Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exposes TensorRT-LLM finish reason to the server #2818

Draft
wants to merge 30 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
0f17415
misc(cmake) update dependencies
mfuntowicz Nov 18, 2024
7a81040
feat(hardware) enable new hardware.hpp and unittests
mfuntowicz Nov 18, 2024
1830fe8
test(ctest) enable address sanitizer
mfuntowicz Nov 18, 2024
3a2698f
feat(backend): initial rewrite of the backend for simplicity
mfuntowicz Nov 18, 2024
6d35657
feat(backend): remove all the logs from hardware.hpp
mfuntowicz Nov 18, 2024
9bb6309
feat(backend): added some logging
mfuntowicz Nov 30, 2024
87272ff
feat(backend): enable compiler warning if support for RVO not applying
mfuntowicz Nov 30, 2024
702dc9c
feat(backend): missing return statement
mfuntowicz Nov 30, 2024
25c6bbe
feat(backend): introduce backend_workspace_t to store precomputed inf…
mfuntowicz Nov 30, 2024
df99164
feat(backend): delete previous backend impl
mfuntowicz Dec 1, 2024
fd7e2b5
feat(backend): more impl
mfuntowicz Dec 1, 2024
71e700a
feat(backend): use latest trtllm main version to have g++ >= 13 compa…
mfuntowicz Dec 1, 2024
879e1a4
feat(backend): allow overriding which Python to use
mfuntowicz Dec 2, 2024
a7bad25
feat(backend): fix backend_exception_t -> backend_error_t naming
mfuntowicz Dec 2, 2024
2f8634e
feat(backend): impl missing generation_step_t as return value of pull…
mfuntowicz Dec 2, 2024
874bc28
feat(backend): make backend_workspace_t::engines_folder constexpr
mfuntowicz Dec 3, 2024
16ba2f5
feat(backend): fix main.rs retrieving the tokenizer
mfuntowicz Dec 3, 2024
c94b9de
feat(backend): add guard to multiple header definitions
mfuntowicz Dec 3, 2024
ad3ed0d
test(backend): add more unittest
mfuntowicz Dec 3, 2024
881527a
feat(backend): remove constexpr from par
mfuntowicz Dec 3, 2024
6253064
feat(backend): remove constexpig
mfuntowicz Dec 3, 2024
cc6bc33
test(backend): more test coverage
mfuntowicz Dec 3, 2024
b6dbf60
chore(trtllm): update dependency towards 0.15.0
mfuntowicz Dec 4, 2024
460f290
effectively cancel the request on the executor
mfuntowicz Dec 4, 2024
300f6c6
feat(backend) fix moving backend when pulling
mfuntowicz Dec 4, 2024
b3cd5ea
feat(backend): make sure we can easily cancel request on the executor
mfuntowicz Dec 5, 2024
049f4ac
feat(backend): fix missing "0" field access
mfuntowicz Dec 5, 2024
f0cd474
misc(backend): fix reborrowing Pin<&mut T> as described in the doc ht…
mfuntowicz Dec 5, 2024
60059b6
feat(trtllm): expose finish reason to Rust
mfuntowicz Dec 10, 2024
b653605
feat(trtllm): fix logits retrieval
mfuntowicz Dec 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 1 addition & 54 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 13 additions & 8 deletions Dockerfile_trtllm
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
ARG OMPI_VERSION="4.1.6"
ARG OMPI_VERSION="4.1.7rc1"

# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest AS chef
Expand All @@ -10,26 +10,29 @@ COPY . .
RUN cargo chef prepare --recipe-path recipe.json

# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt update && apt install -y \
build-essential \
cmake \
curl \
gcc \
g++ \
gcc-14 \
g++-14 \
git \
git-lfs \
libssl-dev \
libucx-dev \
ninja-build \
pkg-config \
pipx \
python3 \
python3-dev \
python3-setuptools \
tar \
wget
wget && \
pipx ensurepath

ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
Expand Down Expand Up @@ -83,13 +86,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
cd backends/trtllm && \
CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release

FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \
FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
python3 -m pip install transformers tokenizers
pipx ensurepath && \
pipx install --include-deps transformers tokenizers

WORKDIR /usr/local/tgi/bin

ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
ENV TOKENIZERS_PARALLELISM=false
ENV OMPI_MCA_plm_rsh_agent=""
Expand Down
73 changes: 57 additions & 16 deletions backends/trtllm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,75 +1,116 @@
cmake_minimum_required(VERSION 3.20)

if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
find_program(CCACHE_EXECUTABLE "ccache")
if (CCACHE_EXECUTABLE)
message(STATUS "Using ccache")
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
endif ()
else ()
find_program(CCACHE_EXECUTABLE ${CMAKE_CXX_COMPILER_LAUNCHER})
message(STATUS "Using user specified cmake cxx compiler launcher: ${CMAKE_CXX_COMPILER_LAUNCHER}")
set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
endif ()

# CMP0135 was introduced with CMake 3.24: opt in to its NEW behaviour whenever the running CMake knows about it
if (POLICY CMP0135)
    cmake_policy(SET CMP0135 NEW)
endif ()

project(tgi-trtllm-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD 23)

include(FetchContent)
include(ExternalProject)
include(CheckCXXCompilerFlag)

option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_USE_LLD "Enable lld linker instead of ld" OFF)
set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
# Root of the TensorRT installation; the default include/ and lib/ locations are derived from it
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")

# We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
find_package(MPI REQUIRED)

#### External dependencies ####
include(cmake/fmt.cmake)
include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)

# Expose the Debug configuration both to CMake (TGI_TRTLLM_BACKEND_DEBUG variable) and to the
# sources (TGI_TRTLLM_BACKEND_DEBUG=1 preprocessor definition).
# The variable always ends up with an explicit value, so that `${TGI_TRTLLM_BACKEND_DEBUG}`
# never expands to an empty string in non-Debug builds; a value already provided by the user is kept.
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(TGI_TRTLLM_BACKEND_DEBUG ON)
    add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
elseif (NOT DEFINED TGI_TRTLLM_BACKEND_DEBUG)
    set(TGI_TRTLLM_BACKEND_DEBUG OFF)
endif ()

# Opt-in switch: link with lld when TGI_TRTLLM_BACKEND_BUILD_USE_LLD is enabled
if (TGI_TRTLLM_BACKEND_BUILD_USE_LLD)
    message(STATUS "Using lld linker")
    add_link_options(-fuse-ld=lld)
endif ()

# Detect whether the compiler is able to warn when it cannot apply named return value
# optimization (NRVO) to a function, i.e. whether it accepts `-Wnrvo`.
check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
if (COMPILER_SUPPORT_WARNING_ON_NVRO)
    message(STATUS "Enabling non-NVRO detection")
    # tgi_trtllm_backend_impl is only declared further down in this file, so the call is deferred
    # to the end of this directory's processing, once the target exists.
    # Flags are passed as separate arguments (a single "-Werror -Wnrvo" string would reach the
    # compiler as one malformed option) and spelled exactly like the flag probed above.
    cmake_language(DEFER CALL target_compile_options tgi_trtllm_backend_impl PRIVATE -Werror -Wnrvo)
endif ()

# Let's build TRTLLM as part of CMake
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")

# Tell CMake not to try to override the RPATH for executorWorker, as it has no information on how to do so
set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)

# TGI TRTLLM Backend definition
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp csrc/backend.cpp)
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_include_directories(tgi_trtllm_backend_impl PRIVATE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
# $<INSTALL_INTERFACE:csrc>
)
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt)
target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)

# This installs all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make them easy to link against / find later
install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB)
# The executor static library is only installed for non-Debug builds.
# The variable is referenced by name on purpose: `${TGI_TRTLLM_BACKEND_DEBUG}` expands to nothing
# when the variable is unset (i.e. any non-Debug build), leaving a bare `if (NOT)` which evaluates
# to false and silently skips the install.
if (NOT TGI_TRTLLM_BACKEND_DEBUG)
    install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
endif ()


#### Unit Tests ####
if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
message(STATUS "Building tests")
FetchContent_Declare(
Catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2
GIT_TAG v3.6.0
URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz
)
FetchContent_MakeAvailable(Catch2)

# add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
# target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml)
add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp)
target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)

# Debug builds of the test suite are instrumented with the address and undefined-behaviour sanitizers
if (CMAKE_BUILD_TYPE MATCHES "Debug")
# Sanitizer flags are appended to the C and C++ compile flags variables
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fsanitize=undefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fsanitize=undefined")
# The same sanitizers are also requested at link time for the test executable
target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address)
endif ()

list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
include(CTest)
include(Catch)
# catch_discover_tests(tgi_trtllm_backend_tests)
catch_discover_tests(tgi_trtllm_backend_tests)
endif ()
9 changes: 5 additions & 4 deletions backends/trtllm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,21 @@ homepage.workspace = true

[dependencies]
async-trait = "0.1"
async-stream = "0.3"
#async-stream = "0.3"
clap = { version = "4.5", features = ["derive"] }
cxx = "1.0"
hashbrown = "0.14"
hf-hub = { workspace = true }
log = { version = "0.4", features = [] }
#log = { version = "0.4", features = [] }
text-generation-router = { path = "../../router" }
tokenizers = { workspace = true }
tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokio-stream = "0.1.15"
thiserror = "1.0.63"
tracing = "0.1"
tracing-opentelemetry = "0.25"
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
#tracing-opentelemetry = "0.25"
#tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
pyo3 = { workspace = true }

[build-dependencies]
cmake = "0.1"
Expand Down
Loading
Loading