@@ -17814,7 +17814,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
17814
17814
node->perf_time_us += time_us_cur;
17815
17815
}
17816
17816
17817
- static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17817
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads ) {
17818
17818
int n_tasks = 0;
17819
17819
17820
17820
switch (node->op) {
@@ -17899,7 +17899,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17899
17899
{
17900
17900
// FIXME: the cost of launching additional threads decreases performance with GPU offloading
17901
17901
//n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
17902
- n_tasks = 1 ;
17902
+ n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1])) ;
17903
17903
} break;
17904
17904
case GGML_OP_SCALE:
17905
17905
case GGML_OP_SET:
@@ -18125,7 +18125,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
18125
18125
/* FINALIZE */
18126
18126
struct ggml_tensor * node = cgraph->nodes[node_n];
18127
18127
if (GGML_OP_HAS_FINALIZE[node->op]) {
18128
- params.nth = ggml_get_n_tasks(node, n_threads);
18128
+ params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads );
18129
18129
ggml_compute_forward(¶ms, node);
18130
18130
}
18131
18131
ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -18135,7 +18135,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
18135
18135
while (++node_n < cgraph->n_nodes) {
18136
18136
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
18137
18137
struct ggml_tensor * node = cgraph->nodes[node_n];
18138
- const int n_tasks = ggml_get_n_tasks(node, n_threads);
18138
+ const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads );
18139
18139
18140
18140
state->shared->perf_node_start_cycles = ggml_perf_cycles();
18141
18141
state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -18183,7 +18183,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
18183
18183
18184
18184
/* INIT & COMPUTE */
18185
18185
struct ggml_tensor * node = cgraph->nodes[node_n];
18186
- const int n_tasks = ggml_get_n_tasks(node, n_threads);
18186
+ const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads );
18187
18187
18188
18188
struct ggml_compute_params params = {
18189
18189
/*.type =*/ GGML_TASK_TYPE_INIT,
@@ -18248,7 +18248,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
18248
18248
for (int i = 0; i < cgraph->n_nodes; i++) {
18249
18249
struct ggml_tensor * node = cgraph->nodes[i];
18250
18250
18251
- const int n_tasks = ggml_get_n_tasks(node, n_threads);
18251
+ const int n_tasks = ggml_get_n_tasks(node, n_threads, 1 );
18252
18252
18253
18253
max_tasks = MAX(max_tasks, n_tasks);
18254
18254
0 commit comments