diff --git a/src/plugins/intel_cpu/src/nodes/rope.cpp b/src/plugins/intel_cpu/src/nodes/rope.cpp index 984b35237d93ba..1ae404b4dca692 100644 --- a/src/plugins/intel_cpu/src/nodes/rope.cpp +++ b/src/plugins/intel_cpu/src/nodes/rope.cpp @@ -334,11 +334,16 @@ struct RoPE::RoPEExecutorQwen : public RoPE::Executor { ov::intel_cpu::PlainTensor t_cos(inputs[1]); // [1, present-kv-length, 1, rotary_dims] ov::intel_cpu::PlainTensor t_sin(inputs[2]); // [1, present-kv-length, 1, rotary_dims] ov::intel_cpu::PlainTensor t_dst(outputs[0]); // [batch, length, head_cnt, head_size]> + ov::intel_cpu::PlainTensor gather; + auto rotary_dims = t_cos.size(3); if (m_config.slice_stop - m_config.slice_start > 0) { t_src = t_src.slice(2, m_config.slice_start, m_config.slice_stop); } + if (m_config.gather_position_arg_id > 0) { + gather.reset(inputs[m_config.gather_position_arg_id]); + } auto batch_size = t_src.size(0); auto seq_len = t_src.size(1); @@ -347,9 +352,19 @@ struct RoPE::RoPEExecutorQwen : public RoPE::Executor { auto present_kv_len = t_cos.size(1); parallel_for3d(batch_size, seq_len, head_cnt, [&](size_t b, size_t p, size_t h) { + size_t sincos_pos; + if (gather) { + if (gather.m_rank == 4) + sincos_pos = gather.at({b, h, p, 0}, true); + else + sincos_pos = gather.at({b, p}, true); + } else { + sincos_pos = present_kv_len - seq_len + p; + } + auto* src = t_src.ptr(b, p, h * head_size); - auto* cos = &t_cos.at({b, present_kv_len - seq_len + p, h, 0}, true); - auto* sin = &t_sin.at({b, present_kv_len - seq_len + p, h, 0}, true); + auto* cos = &t_cos.at({b, sincos_pos, h, 0}, true); + auto* sin = &t_sin.at({b, sincos_pos, h, 0}, true); auto* dst = t_dst.ptr(b, p, h); if (m_rotaryKernel) {