From 19471980da3acc89efe04664667b7369733528c2 Mon Sep 17 00:00:00 2001 From: merefield Date: Tue, 11 Jun 2024 19:26:14 +0100 Subject: [PATCH] IMPROVE: legalise URLs found in raw contents of Posts --- lib/discourse_chatbot/bots/open_ai_bot_rag.rb | 5 ++- .../functions/forum_search_function.rb | 33 +++++++++++++++++++ plugin.rb | 5 ++- .../functions/forum_search_function_spec.rb | 11 +++++-- 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/lib/discourse_chatbot/bots/open_ai_bot_rag.rb b/lib/discourse_chatbot/bots/open_ai_bot_rag.rb index 00c36e5..5caca1e 100644 --- a/lib/discourse_chatbot/bots/open_ai_bot_rag.rb +++ b/lib/discourse_chatbot/bots/open_ai_bot_rag.rb @@ -269,13 +269,12 @@ def call_function(func_name, args_str, opts) def legal_urls?(res, post_ids_found, topic_ids_found) return true if res.blank? - post_url_regex = %r{\/t/[^/]+/(\d+)/(\d+)} - topic_url_regex = %r{\/t/[^/]+/(\d+)(?!\d|\/)} + post_url_regex = ::DiscourseChatbot::POST_URL_REGEX + topic_url_regex = ::DiscourseChatbot::TOPIC_URL_REGEX topic_ids_in_text = res.scan(topic_url_regex).flatten post_combos_in_text = res.scan(post_url_regex) - topic_ids_in_text.each do |topic_id_in_text| if !topic_ids_found.include?(topic_id_in_text.to_i) return false diff --git a/lib/discourse_chatbot/functions/forum_search_function.rb b/lib/discourse_chatbot/functions/forum_search_function.rb index 13b59cc..a8a2f1b 100644 --- a/lib/discourse_chatbot/functions/forum_search_function.rb +++ b/lib/discourse_chatbot/functions/forum_search_function.rb @@ -101,6 +101,12 @@ def process(args) break if post.nil? 
next if post.deleted_at || !accepted_post_types.include?(post.post_type) response += I18n.t("chatbot.prompt.function.forum_search.answer.topic.each.post", post_number: post_number, username: post.user.username, date: post.created_at, raw: post.raw) + + topic_ids_in_raw_urls_found, post_ids_in_raw_urls_found = find_post_and_topic_ids_from_raw_urls(post.raw) + + topic_ids_found = topic_ids_found | topic_ids_in_raw_urls_found + post_ids_found = post_ids_found | post_ids_in_raw_urls_found + post_ids_found << post.id post_number += 1 end @@ -115,6 +121,12 @@ def process(args) username = User.find(current_post.user_id).username date = current_post.created_at.to_date response += I18n.t("chatbot.prompt.function.forum_search.answer.post.each", url: url, username: username, date: date, raw: raw, score: score, rank: index + 1) + + topic_ids_in_raw_urls_found, post_ids_in_raw_urls_found = find_post_and_topic_ids_from_raw_urls(raw) + + topic_ids_found = topic_ids_found | topic_ids_in_raw_urls_found + post_ids_found = post_ids_found | post_ids_in_raw_urls_found + post_ids_found << current_post.id end end @@ -124,5 +136,26 @@ def process(args) { result: I18n.t("chatbot.prompt.function.forum_search.error", query: args[parameters[0][:name]]), topic_ids_found: [], post_ids_found: [] } end end + + # Scans raw for topic URLs (TOPIC_URL_REGEX) and post URLs (POST_URL_REGEX) and collects the ids they reference. + # Returns [topic_ids_found, post_ids_found]; post links with no matching Post record are skipped. + def find_post_and_topic_ids_from_raw_urls(raw) + post_ids_found = [] + + topic_ids_in_raw_topic_links = raw.scan(::DiscourseChatbot::TOPIC_URL_REGEX).flatten + topic_ids_found = topic_ids_in_raw_topic_links.map(&:to_i) + + post_combos_in_raw_post_links = raw.scan(::DiscourseChatbot::POST_URL_REGEX) + + post_combos_in_raw_post_links.each do |topic_id_in_text, post_number_in_text| + # find_by returns nil when the link targets another site or a post that no longer exists + post = ::Post.find_by(topic_id: topic_id_in_text.to_i, post_number: post_number_in_text.to_i) + next if post.nil? + post_ids_found << post.id + topic_ids_found << post.topic_id + end + + return topic_ids_found, post_ids_found + end end end diff --git a/plugin.rb b/plugin.rb index 
c413667..b1ba547 100644 --- a/plugin.rb +++ b/plugin.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true # name: discourse-chatbot # about: a plugin that allows you to have a conversation with a configurable chatbot in Discourse Chat, Topics and Private Messages -# version: 0.9.29 +# version: 0.9.30 # authors: merefield # url: https://github.com/merefield/discourse-chatbot @@ -34,6 +34,9 @@ module ::DiscourseChatbot EMBEDDING_PROCESS_POSTS_CHUNK = 300 + TOPIC_URL_REGEX = %r{\/t/[^/]+/(\d+)(?!\d|\/)} + POST_URL_REGEX = %r{\/t/[^/]+/(\d+)/(\d+)(?!\d|\/)} + def progress_debug_message(message) puts "Chatbot: #{message}" if SiteSetting.chatbot_enable_verbose_console_logging Rails.logger.info("Chatbot: #{message}") if SiteSetting.chatbot_enable_verbose_rails_logging diff --git a/spec/lib/functions/forum_search_function_spec.rb b/spec/lib/functions/forum_search_function_spec.rb index 6372379..a70cd5a 100644 --- a/spec/lib/functions/forum_search_function_spec.rb +++ b/spec/lib/functions/forum_search_function_spec.rb @@ -8,7 +8,7 @@ let(:post_3) { Fabricate(:post, topic: topic_1, raw: "on the plain", post_number: 3) } let(:post_4) { Fabricate(:post, topic: topic_1, raw: "or so they say!", post_number: 4) } let(:topic_2) { Fabricate(:topic, title: "weather in northern Europe") } - let(:post_5) { Fabricate(:post, topic: topic_2, raw: "rains everywhere", post_number: 1) } + let(:post_5) { Fabricate(:post, topic: topic_2, raw: "rains everywhere https://example.com/t/slug/#{post_2.topic_id}/#{post_2.post_number} ", post_number: 1) } let(:topic_3) { Fabricate(:topic, title: "nothing to do with the weather")} let(:post_6) { Fabricate(:post, topic: topic_3, raw: "cars go fast", post_number: 1) } @@ -42,10 +42,11 @@ expect(topic_1).not_to be_nil expect(topic_2).not_to be_nil expect(topic_3).not_to be_nil - expect(subject.process(args)[:topic_ids_found]).to eq([]) + expect(subject.process(args)[:topic_ids_found]).to eq([post_2.topic_id]) expect(subject.process(args)[:post_ids_found]).to 
include(post_5.id) expect(subject.process(args)[:post_ids_found]).to include(post_3.id) - expect(subject.process(args)[:post_ids_found]).not_to include(post_2.id) + expect(subject.process(args)[:post_ids_found]).to include(post_2.id) + expect(subject.process(args)[:post_ids_found]).not_to include(post_4.id) expect(subject.process(args)[:result]).to include(post_3.raw) end @@ -71,4 +72,8 @@ expect(subject.process(args)[:result]).not_to include(topic_3.title) expect(subject.process(args)[:result]).not_to include(post_4.raw) end + + it "finds urls with a post id" do + expect(subject.find_post_and_topic_ids_from_raw_urls(post_5.raw)).to eq([[post_2.topic_id], [post_2.id]]) + end end \ No newline at end of file