[Snippets] SplitDimensionM: heuristic update #28180
base: master
```diff
@@ -4,8 +4,8 @@
 #include "snippets/pass/split_dimension_m.hpp"
 
-#include "snippets/utils/utils.hpp"
 #include "snippets/itt.hpp"
+#include "snippets/utils/utils.hpp"
 
 namespace {
 size_t get_dim_M(const ov::Shape& shape) {
```
```diff
@@ -31,45 +31,55 @@ bool SplitDimensionM::is_supported_matmul(const std::shared_ptr<const ov::Node>&
     return matmul && !matmul->get_transpose_a() && !matmul->is_dynamic();
 }
 
-std::pair<size_t, size_t> SplitDimensionM::get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) {
-    std::pair<size_t, size_t> splited = { 1, m_dim };
-
+std::pair<size_t, size_t> SplitDimensionM::split_ideally(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) {
     // Ideal case #1: M can be split on the parts one of which complements the batch dimension to the optimal parallel work amount
     // In this case, each thread will execute the Snippets kernel once
     const size_t lower_bound = optimal_parallelism_work_amount / batch_dim;
-    if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0) {
-        splited.first = lower_bound;
-        splited.second = m_dim / lower_bound;
-        OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!");
-        return splited;
-    }
+    if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0)
+        return std::make_pair(lower_bound, m_dim / lower_bound);
 
     // Ideal case #2: M is divisible by optimal parallel work amount, and the new_m_dim is big enough
     // In this case, each thread will execute the Snippets kernel 'batch_dim' times
     if (m_dim % optimal_parallelism_work_amount == 0) {
         const auto new_m_dim = m_dim / optimal_parallelism_work_amount;
         const size_t min_kernel_m = 64;
-        if (new_m_dim >= min_kernel_m) {
-            splited.first = optimal_parallelism_work_amount;
-            splited.second = new_m_dim;
-            OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!");
-            return splited;
-        }
+        if (new_m_dim >= min_kernel_m)
+            return std::make_pair(optimal_parallelism_work_amount, new_m_dim);
     }
 
+    return std::make_pair(1, m_dim);
+}
+
+std::pair<size_t, size_t> SplitDimensionM::split_conservatively_increase_parallel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) {
+    std::pair<size_t, size_t> splited = { 1, m_dim };
     const size_t upper_bound = utils::div_up(2 * optimal_parallelism_work_amount, batch_dim);
     for (size_t divisor_0 = upper_bound - 1; divisor_0 > 1; divisor_0--) {
         size_t divisor_1 = m_dim / divisor_0;
-        if (divisor_1 * divisor_0 == m_dim) {
-            splited.first = divisor_0;
-            splited.second = divisor_1;
-            break;
-        }
+        if (divisor_1 * divisor_0 == m_dim)
+            return divisor_0 * batch_dim >= optimal_parallelism_work_amount ? std::make_pair(divisor_0, divisor_1) : splited;
     }
-    OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!");
     return splited;
 }
 
+std::pair<size_t, size_t> SplitDimensionM::split_minimize_kernel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) {
+    constexpr size_t min_kernel_m = 32;
+    std::pair<size_t, size_t> best_result = {1, m_dim};
+    for (size_t divisor = 2; divisor < std::sqrt(m_dim); ++divisor) {
+        if (m_dim % divisor != 0)
+            continue;
+        if (divisor >= min_kernel_m)
+            return std::make_pair(m_dim / divisor, divisor);
+
+        const size_t m_kernel = m_dim / divisor;
+        if (m_kernel >= min_kernel_m) {
+            best_result.first = divisor;
+            best_result.second = m_kernel;
+        }
+    }
```
Comment on lines +70 to +76

**Reviewer:** It's not clear from this code why we try to find the maximal divisor. If […]

**Author:** The main thing I want to point out is that this heuristic maximizes […]. My logic is structured in the following way (taking into account that `divisor` is ascending) for the splitting candidates: […] Alternatively, I can implement the same logic via 2 […].

**Author:** This is true. But the current heuristic covers the most important cases (big shapes in the SD topology), at least on the machines where these changes were tested. And we agreed offline that we need to limit these changes' impact on other topologies.
```diff
+    }
+    if (best_result.first * batch_dim >= optimal_parallelism_work_amount)
+        return best_result;
+    return std::make_pair(1, m_dim);
+}
 
 bool SplitDimensionM::can_be_optimized(const std::shared_ptr<const ov::Node>& node, size_t concurrency) {
     if (!is_supported_matmul(node))
         return false;
```
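The review thread above asks why the loop keeps the maximal divisor, and the author mentions the same logic could be implemented "via 2" of something, presumably two separate loops. Below is a minimal sketch of what that two-loop formulation might look like; the function name, the shortened `optimal_wa` parameter, and the simplified square-root boundary handling are illustrative assumptions, not code from the PR.

```cpp
// Hypothetical two-loop rendering of split_minimize_kernel_wa's selection
// logic (the PR keeps the single-loop form). Pass 1 looks for the smallest
// divisor that can itself serve as kernel M; pass 2 falls back to the largest
// small divisor whose cofactor is big enough.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <utility>
using std::size_t;

std::pair<size_t, size_t> split_minimize_kernel_wa_two_loops(size_t batch_dim, size_t m_dim, size_t optimal_wa) {
    constexpr size_t min_kernel_m = 32;
    const size_t bound = static_cast<size_t>(std::sqrt(m_dim));
    // Pass 1: ascending divisors in [min_kernel_m, sqrt(m_dim)); the first hit
    // is the smallest kernel M that still satisfies the threshold.
    for (size_t d = min_kernel_m; d < bound; ++d)
        if (m_dim % d == 0)
            return {m_dim / d, d};
    // Pass 2: descending divisors below min_kernel_m; the first hit maximizes
    // the batch part and therefore minimizes the cofactor kernel M.
    for (size_t d = std::min(min_kernel_m, bound); d-- > 2;)
        if (m_dim % d == 0 && m_dim / d >= min_kernel_m)
            return d * batch_dim >= optimal_wa ? std::make_pair(d, m_dim / d)
                                               : std::make_pair(size_t{1}, m_dim);
    return {1, m_dim};
}

int main() {
    // M = 4620: its divisors 2..30 are all below 32, so pass 1 fires at 33,
    // the first divisor >= 32, matching the single-loop version.
    const auto r = split_minimize_kernel_wa_two_loops(2, 4620, 32);
    std::cout << r.first << " x " << r.second << "\n";  // prints "140 x 33"
}
```

Either way, the ascending single loop in the PR already guarantees that the first divisor at or above `min_kernel_m` yields the smallest admissible kernel M, while `best_result` tracks the best fallback among the small divisors.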
```diff
@@ -131,16 +141,30 @@ bool SplitDimensionM::split(const ov::Shape& shape, size_t optimal_parallelism_w
     if (is_prime_number(m_dim))
         return false;
 
-    auto is_optimized = [&](size_t batch_dim) {
-        return batch_dim >= optimal_parallelism_work_amount;
-    };
-
     // We skip optimization if the current batch is optimal for concurrency
-    if (is_optimized(batch_dim))
+    if (batch_dim % optimal_parallelism_work_amount == 0)
         return false;
 
-    std::tie(batch_m_dim, new_m_dim) = get_splited_dimensions(batch_dim, m_dim, optimal_parallelism_work_amount);
-    return is_optimized(batch_dim * batch_m_dim);
+    auto split_is_done = [&batch_m_dim]() {
+        return batch_m_dim != 1;
+    };
+
+    std::tie(batch_m_dim, new_m_dim) = split_ideally(batch_dim, m_dim, optimal_parallelism_work_amount);
+    if (split_is_done())
+        return true;
+
+    // If M dim is big enough, aggressive heuristic is used for kernel_m minimization.
+    // For smaller M dim, conservative heuristic is used to preserve old behaviour.
+    const bool big_m_dim = m_dim >= 4000;
```
Comment on lines +156 to +158

**Reviewer:** By the way, we can also support the case with small […]. I don't insist on doing it in this PR, but I have some models (for example, action-recognition or levit) with small values of batch and […].

**Author:** I like your idea. In third-party brgemm heuristics, I saw that the minimal allowed […].
```diff
+    if (big_m_dim) {
+        std::tie(batch_m_dim, new_m_dim) = split_minimize_kernel_wa(batch_dim, m_dim, optimal_parallelism_work_amount);
+        if (split_is_done())
+            return true;
+    }
+    if (batch_dim < optimal_parallelism_work_amount) {
+        std::tie(batch_m_dim, new_m_dim) = split_conservatively_increase_parallel_wa(batch_dim, m_dim, optimal_parallelism_work_amount);
+    }
+    return split_is_done();
 }
 
 void SplitDimensionM::reshape_subgraph(const std::shared_ptr<op::Subgraph>& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim) {
```
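To make the new cascade concrete, here is a self-contained sketch (not part of the PR) that mirrors the three heuristics and the driver above with shortened names (`split_t`, `opt_wa`, `split_conservatively`); it omits the shape parsing and prime check, and all shape values are illustrative.

```cpp
// Mirrors the PR's cascade: ideal split first, then the aggressive
// kernel-minimizing heuristic for M >= 4000, then the conservative fallback.
#include <cmath>
#include <cstddef>
#include <iostream>
#include <utility>
using std::size_t;
using split_t = std::pair<size_t, size_t>;  // {batch_m_dim, new_m_dim}

split_t split_ideally(size_t batch, size_t m, size_t opt_wa) {
    const size_t lower_bound = opt_wa / batch;
    if (lower_bound * batch == opt_wa && m % lower_bound == 0)
        return {lower_bound, m / lower_bound};   // one kernel run per thread
    if (m % opt_wa == 0 && m / opt_wa >= 64)
        return {opt_wa, m / opt_wa};             // 'batch' kernel runs per thread
    return {1, m};
}

split_t split_minimize_kernel_wa(size_t batch, size_t m, size_t opt_wa) {
    constexpr size_t min_kernel_m = 32;
    split_t best{1, m};
    for (size_t d = 2; d < std::sqrt(m); ++d) {
        if (m % d != 0)
            continue;
        if (d >= min_kernel_m)
            return {m / d, d};                   // smallest kernel M >= 32
        if (m / d >= min_kernel_m)
            best = {d, m / d};                   // largest small divisor so far
    }
    return best.first * batch >= opt_wa ? best : split_t{1, m};
}

split_t split_conservatively(size_t batch, size_t m, size_t opt_wa) {
    const size_t upper_bound = (2 * opt_wa + batch - 1) / batch;  // div_up
    for (size_t d0 = upper_bound - 1; d0 > 1; d0--) {
        const size_t d1 = m / d0;
        if (d1 * d0 == m)
            return d0 * batch >= opt_wa ? split_t{d0, d1} : split_t{1, m};
    }
    return {1, m};
}

split_t split(size_t batch, size_t m, size_t opt_wa) {
    if (batch % opt_wa == 0)
        return {1, m};                           // batch already saturates threads
    split_t r = split_ideally(batch, m, opt_wa);
    if (r.first != 1)
        return r;
    if (m >= 4000) {                             // aggressive path for big M only
        r = split_minimize_kernel_wa(batch, m, opt_wa);
        if (r.first != 1)
            return r;
    }
    return batch < opt_wa ? split_conservatively(batch, m, opt_wa) : r;
}

int main() {
    struct Case { size_t batch, m, threads; };
    const Case cases[] = {
        {4, 1024, 16},  // ideal case #1: 16/4 = 4 complements the batch -> {4, 256}
        {2, 4620, 32},  // no ideal split, M >= 4000 -> aggressive picks {140, 33}
    };
    for (const auto& [b, m, t] : cases) {
        const split_t r = split(b, m, t);
        std::cout << "batch=" << b << " M=" << m << " -> {"
                  << r.first << ", " << r.second << "}\n";
    }
}
```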
**Reviewer:** In the second case in `compute_ideal_cases_heuristic`, `min_kernel_m` is 64, while it is 32 here. What about always using 64 and making it a const static attribute of the class? Or is there a difference between the heuristics, so that we really need the smaller `min_kernel_m` in the aggressive one?

**Author:** I agree that it's better to have one `min_kernel_m` value. But I think it should be 32, not 64. The value 64 was set empirically, to avoid the cases in which the external repacking feature doesn't work and the overheads of duplicating the repacking inside the kernel are bigger than the benefits from the splitting. If external repacking works (and it seems it will work in all cases after the tokenization adjustments), we can easily lower `min_kernel_m` for `compute_ideal_cases_heuristic`.
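If that suggestion were adopted, the shared threshold might look like the stripped-down sketch below; only the class name and the value 32 come from the discussion, and the real class of course declares far more than this.

```cpp
#include <cstddef>

// Hypothetical illustration of the unification discussed above: one shared
// threshold declared once on the class and used by both the ideal and the
// aggressive heuristics, instead of two local constants (64 and 32).
class SplitDimensionM {
public:
    static constexpr std::size_t min_kernel_m = 32;
};

static_assert(SplitDimensionM::min_kernel_m == 32, "one threshold for both heuristics");
```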