Context pruning uses line numbers (sweepai#2757)

# Purpose
Please provide a high-level overview of what this pull request aims to achieve.

# Changes Made
Please provide a detailed list of the changes made in this pull request.
1.
2.
3.

# Additional Notes
Please provide any additional notes or screenshots here.

When you make a PR, please ping us on Discord at http://discord.gg/sweep.

---------

Co-authored-by: wwzeng1 <william@sweep.dev>
NordcomInc · Dec 13, 2023 · bb084a7 · bb084a7
1 parent 5d2923a
commit bb084a7
Show file tree

Hide file tree

Showing 7 changed files with 362 additions and 165 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -20,8 +20,8 @@
             "console":"integratedTerminal",
             "justMyCode":true,
             "args": [
-                "https://github.com/sweepai/ui/issues/2"
-                // "https://github.com/sweepai/sweep/issues/2669"
+                // "https://github.com/sweepai/sweep/issues/2738",
+                "https://github.com/sweepai/sweep/issues/2758"
             ]
         },
         {

diff --git a/sweepai/core/context_pruning.py b/sweepai/core/context_pruning.py
diff --git a/sweepai/core/prompts.py b/sweepai/core/prompts.py
@@ -201,14 +201,15 @@
 # Plan:
 <plan>
 <create file="file_path_1" relevant_files="space-separated list of ALL files relevant for creating file_path_1">
-* Instructions for creating the new file needed to solve the issue
-* Include references to all files, imports and entity names
+* Natural language instructions for creating the new file needed to solve the issue.
+* Reference necessary files, imports and entity names.
 ...
 </create>
 ...
 
-<modify file="file_path_2" relevant_files="space-separated list of ALL files relevant for modifying file_path_2">
-* Instructions for the modifications needed to solve the issue. Be concise and mention references to all files, imports and entity names.
+<modify file="file_path_2" start_line="i" end_line="j" relevant_files="space-separated list of ALL files relevant for modifying file_path_2">
+* Natural language instructions for the modifications needed to solve the issue.
+* Be concise and reference necessary files, imports and entity names.
 ...
 </modify>
 ...

diff --git a/sweepai/utils/code_tree.py b/sweepai/utils/code_tree.py
@@ -1,4 +1,5 @@
 import ast
+import re
 
 import tree_sitter_languages
 from pydantic import BaseModel
@@ -7,17 +8,18 @@
 
 class CodeTree(BaseModel):
     code: str
+    language: str
     tree: Tree
 
     class Config:
         arbitrary_types_allowed = True
 
     @classmethod
-    def from_code(cls, code: str):
+    def from_code(cls, code: str, language: str = "python"):
         parser = Parser()
-        parser.set_language(tree_sitter_languages.get_language("python"))
+        parser.set_language(tree_sitter_languages.get_language(language))
         tree = parser.parse(bytes(code, "utf8"))
-        return cls(code=code, tree=tree)
+        return cls(code=code, language=language, tree=tree)
 
     def get_path_to_line(self, min_line: int, max_line: int = -1) -> list[Node]:
         if max_line == -1:
@@ -66,6 +68,68 @@ def get_lines_surrounding(
         else:
             return (min_line, max_line)
 
+    def get_preview(self, min_line: int = 5, max_line: int = 1200):
+        last_end_line = -1
+        lines = self.code.splitlines()
+        def get_children(node: Node = self.tree.root_node):
+            nonlocal last_end_line
+            children = []
+            for child in node.children:
+                start_line, _ = child.start_point
+                end_line, _ = child.end_point
+                if start_line <= last_end_line:
+                    continue
+                text = "\n".join(lines[start_line : end_line + 1])
+                indentation = " " * (len(text) - len(text.lstrip()))
+                for i in range(last_end_line + 1, start_line):
+                    line = lines[i]
+                    children.append(f"{i} | {line}")
+                    last_end_line = i
+                if end_line - start_line > max_line:
+                    children.extend(get_children(child))
+                elif end_line - start_line < min_line:
+                    text = "\n".join(
+                        [
+                            f"{start_line + i} | {line}"
+                            for i, line in enumerate(text.split("\n"))
+                        ]
+                    )
+                    children.append(text)
+                else:
+                    node_lines = text.split("\n")
+                    first_line = node_lines[0]
+                    first_line = f"{start_line} | {first_line}"
+                    second_line = node_lines[1]
+                    second_line = f"{start_line + 1} | {second_line}"
+                    hidden_lines_content = "\n".join(lines[start_line + 2 : end_line - 1])
+                    number_of_terms = 5
+                    first_n_terms = ", ".join(extract_words(hidden_lines_content)[:number_of_terms])
+                    spacing = " " * (len(str(start_line)) + 2)
+                    middle_lines = spacing.join(
+                        [
+                        spacing + indentation + f"     ...\n",
+                        indentation + f"     (lines {start_line + 1}-{end_line - 1} contains terms: {first_n_terms}\n",
+                        indentation + f"     ...\n",
+                        ]
+                    )
+                    second_last_line = node_lines[-2]
+                    second_last_line = f"{end_line - 1} | {second_last_line}"
+                    last_line = node_lines[-1]
+                    last_line = f"{end_line} | {last_line}"
+                    children.append(first_line)
+                    children.append(second_line)
+                    children.append(middle_lines)
+                    children.append(second_last_line)
+                    children.append(last_line)
+                last_end_line = end_line
+            return children
+        return "\n".join(get_children())
+
+
+def extract_words(string):
+    # extract the unique words from a code snippet, in order of first appearance
+    words = re.findall(r"\w+", string)
+    return list(dict.fromkeys(words))
 
 def get_global_function_names_and_spans(node):
     return [
@@ -87,11 +151,33 @@ def test_check_comments_presence_with_unsupported_file_extension(self, mock_spli
 from unittest.mock import patch
 from sweepai.utils.comment_utils import check_comments_presence
 
+def helper():
+    x = 1
+    y = 2
+    z = 3
+    return x + y + z
+
 class TestCheckCommentsPresence(unittest.TestCase):
 
     @patch('os.path.splitext')
     def test_check_comments_presence_with_comment(self, mock_splitext):
         mock_splitext.return_value = ('file', '.py')
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
+        x = 1
         self.assertEqual(check_comments_presence('file.py', '# This is a comment'), True)
 
     @patch('os.path.splitext')
@@ -112,9 +198,13 @@ def test_check_comments_presence_with_empty_new_code(self, mock_splitext):
 if __name__ == '__main__':
     unittest.main()
 """
-    split_code = full_code.split("\n")
-    match_start = 16
-    match_end = 20
-    code_tree = CodeTree.from_code(full_code)
-    print(code_tree.get_lines_surrounding(match_start)[0])
-    print(code_tree.get_lines_surrounding(match_end)[1])
+    # split_code = full_code.split("\n")
+    file_contents = open("sweepai/handlers/on_ticket.py").read()
+    # file_contents = full_code
+    # match_start = 16
+    # match_end = 20
+    code_tree = CodeTree.from_code(file_contents)
+    print(code_tree.get_preview())
+    print(len(code_tree.get_preview().split("\n")))
+    # print(code_tree.get_lines_surrounding(match_start)[0])
+    # print(code_tree.get_lines_surrounding(match_end)[1])
diff --git a/sweepai/utils/str_utils.py b/sweepai/utils/str_utils.py
@@ -88,3 +88,8 @@ def clean_logs(logs: str):
 def extract_lines(text: str, start: int, end: int):
     lines = text.splitlines(keepends=True)
     return "\n".join(lines[max(0, start) : min(len(lines), end)])
+
+
+def add_line_numbers(text: str, start: int = 0):
+    lines = text.splitlines(keepends=True)
+    return "".join(f"{start + i} | {line}" for i, line in enumerate(lines))
diff --git a/sweepai/utils/ticket_utils.py b/sweepai/utils/ticket_utils.py
@@ -1,5 +1,5 @@
-from threading import Thread
 import traceback
+from threading import Thread
 from time import time
 
 from loguru import logger
@@ -8,7 +8,10 @@
 from sweepai.core.context_pruning import RepoContextManager, get_relevant_context
 from sweepai.core.entities import Snippet
 from sweepai.core.lexical_search import search_index
-from sweepai.core.vector_db import compute_vector_search_scores, prepare_lexical_search_index
+from sweepai.core.vector_db import (
+    compute_vector_search_scores,
+    prepare_lexical_search_index,
+)
 from sweepai.logn.cache import file_cache
 from sweepai.utils.chat_logger import discord_log_error
 from sweepai.utils.event_logger import posthog
@@ -47,9 +50,13 @@ def prep_snippets(
         codebase_score = files_to_scores.get(snippet.file_path, 0.08)
         snippet_score = 0.1
         if snippet_to_key(snippet) in content_to_lexical_score:
-            snippet_score = content_to_lexical_score[snippet_to_key(snippet)] * codebase_score
+            snippet_score = (
+                content_to_lexical_score[snippet_to_key(snippet)] * codebase_score
+            )
         else:
-            content_to_lexical_score[snippet_to_key(snippet)] = snippet_score * codebase_score
+            content_to_lexical_score[snippet_to_key(snippet)] = (
+                snippet_score * codebase_score
+            )
 
     ranked_snippets = sorted(
         snippets,
@@ -79,6 +86,7 @@ def prep_snippets(
         current_top_snippets=ranked_snippets,
         snippets=snippets,
         snippet_scores=content_to_lexical_score,
+        cloned_repo=cloned_repo,
     )
     return repo_context_manager
 
@@ -239,19 +247,22 @@ def log_error(
 def center(text: str) -> str:
     return f"<div align='center'>{text}</div>"
 
+
 def fire_and_forget_wrapper(call):
     """
     This decorator is used to run a function in a separate thread.
     It does not return anything and does not wait for the function to finish.
     It fails silently.
     """
+
     def wrapper(*args, **kwargs):
         def run_in_thread(call, *a, **kw):
             try:
                 call(*a, **kw)
             except:
                 pass
+
         thread = Thread(target=run_in_thread, args=(call,) + args, kwargs=kwargs)
         thread.start()
 
-    return wrapper
+    return wrapper
diff --git a/sweepai/utils/tree_utils.py b/sweepai/utils/tree_utils.py
@@ -35,11 +35,11 @@ def __repr__(self):
 
 class DirectoryTree:
     def __init__(self):
-        self.original_lines = []
-        self.lines = []
+        self.original_lines: list[Line] = []
+        self.lines: list[Line] = []
 
-    def parse(self, input_str):
-        stack = []  # To keep track of parent directories
+    def parse(self, input_str: str):
+        stack: list[Line] = []  # To keep track of parent directories
         for line in input_str.strip().split("\n"):
             indent_count = (len(line) - len(line.lstrip())) // 2
             line = line.strip()
@@ -152,6 +152,8 @@ def remove_multiple(self, targets):
             self.remove(target)
 
     def __str__(self):
-        return "\n".join(
-            ("  " * line.indent_count) + line.full_path() for line in self.lines
-        )
+        results = []
+        for line in self.lines:
+            line_text = line.text.split("/")[-2] + "/" if line.is_dir else line.text
+            results.append(("  " * line.indent_count) + line_text)
+        return "\n".join(results)