From 0436ae9c656135d63330bb845379d9a1924a443c Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Thu, 28 Sep 2023 11:07:38 -0600 Subject: [PATCH 01/11] update file-level --- cli.py | 63 ++++++++++--------- devtale/utils.py | 158 ++++++++++++++++++++++++++++------------------- 2 files changed, 128 insertions(+), 93 deletions(-) diff --git a/cli.py b/cli.py index 5084e27..877d223 100644 --- a/cli.py +++ b/cli.py @@ -7,12 +7,6 @@ import click from dotenv import load_dotenv -from devtale.aggregators import ( - GoAggregator, - JavascriptAggregator, - PHPAggregator, - PythonAggregator, -) from devtale.constants import ( ALLOWED_EXTENSIONS, ALLOWED_NO_CODE_EXTENSIONS, @@ -22,6 +16,7 @@ from devtale.utils import ( build_project_tree, extract_code_elements, + fuse_documentation, fuse_tales, get_unit_tale, prepare_code_elements, @@ -306,15 +301,20 @@ def process_file( debug: bool = False, cost_estimation: bool = False, ) -> None: + """It creates a dev tale for the file input.""" cost = 0 file_name = os.path.basename(file_path) file_ext = os.path.splitext(file_name)[-1] save_path = os.path.join(output_path, f"{file_name}.json") + # For the debugging mode we do not want to process the file, we only want + # to verify input. Useful to verify the repository/directories flow. if debug: logger.debug(f"FILE INFO:\nfile_path: {file_path}\nsave_path: {save_path}") return {"file_docstring": "-"}, cost + # Create output dir if it does not exists and only if we are not + # pre-estimating the cost. if not os.path.exists(output_path) and not cost_estimation: os.makedirs(output_path) @@ -322,9 +322,13 @@ def process_file( with open(file_path, "r") as file: code = file.read() + # Return empty devtale if the input file is empty if not code: return {"file_docstring": ""}, cost + # Avoid processing a file twice if we already have a tale for it. + # Only fuse it again. Useful to avoid GPT calls in case of debugging + # aggregators. if os.path.exists(save_path): logger.info(f"Skipping {file_name} as its tale file already exists.") with open(save_path, "r") as file: @@ -333,9 +337,12 @@ def process_file( fuse_documentation(code, found_tale, output_path, file_name, file_ext) return found_tale, cost + # For config/bash files we do not aim to document the file itself. We + # care about understanding what the file does. if not file_ext or file_ext in ALLOWED_NO_CODE_EXTENSIONS: # a small single chunk is enough no_code_file = split_text(code, chunk_size=5000)[0].page_content + # prepare input no_code_file_data = { "file_name": file_name, "file_content": no_code_file, @@ -350,6 +357,10 @@ def process_file( return {"file_docstring": file_docstring}, cost + # big_docs reduces the number of GPT-4 calls as we want to extract + # functions/classes names, while short_docs allows GPT-4 to focus in + # a more granular context to accurately generate the docstring for each + # function/class that it found. logger.info("split dev draft ideas") big_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=10000) short_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=3000) @@ -364,13 +375,16 @@ def process_file( if elements_set: code_elements.append(elements_set) + # Combine all the code elements extracted into a single general Dict + # without duplicates. 
logger.info("prepare code elements") code_elements_dict = prepare_code_elements(code_elements) # Make a copy to keep the original dict intact code_elements_copy = copy.deepcopy(code_elements_dict) - # clean + # Clean dict copy to remove keys with empty values and the summaries + # of each code chunk. code_elements_copy.pop("summary", None) if not code_elements_copy["classes"]: code_elements_copy.pop("classes", None) @@ -379,7 +393,8 @@ def process_file( logger.info("create tale sections") tales_list = [] - # process only if we have elements to document + # Generate a docstring for each class and function/method in the + # code_elements if code_elements_copy or cost_estimation: for idx, doc in enumerate(short_docs): tale, call_cost = get_unit_tale( @@ -392,15 +407,20 @@ def process_file( tales_list.append(tale) logger.info(f"tale section {str(idx+1)}/{len(short_docs)} done.") + # Combine all generated docstrings JSON-formated ouputs into a single, + # general one. logger.info("create dev tale") tale, errors = fuse_tales(tales_list, code, code_elements_dict) + # Check if we discarded some docstrings if len(errors) > 0: logger.info( f"We encountered errors while fusing the following \ tales for {file_name} - Corrupted tales: {errors}" ) + # Generate a top-level docstrings using as context all the summaries we got + # from each big_doc code chunk output logger.info("add dev tale summary") summaries = split_text(str(code_elements_dict["summary"]), chunk_size=9000) @@ -412,15 +432,18 @@ def process_file( ) cost += call_cost + # Add the docstrings in the code file if fuse and not cost_estimation: - # add docstring label only to insert it along the docstring into the code + # add devtale label into the top-file summary tale["file_docstring"] = DOCSTRING_LABEL + "\n" + file_docstring - fuse_documentation(code, tale, output_path, file_name, file_ext) + fused_save_path = os.path.join(output_path, file_name) + logger.info(f"save fused dev tale in: {fused_save_path}") + fuse_documentation(code, tale, file_ext, save_path=fused_save_path) + # remove devtale label tale["file_docstring"] = file_docstring logger.info(f"save dev tale in: {save_path}") - if not cost_estimation: with open(save_path, "w") as json_file: json.dump(tale, json_file, indent=2) @@ -428,24 +451,6 @@ def process_file( return tale, cost -def fuse_documentation(code, tale, output_path, file_name, file_ext): - save_path = os.path.join(output_path, file_name) - logger.info(f"save fused dev tale in: {save_path}") - - if file_ext == ".py": - aggregator = PythonAggregator() - elif file_ext == ".php": - aggregator = PHPAggregator() - elif file_ext == ".go": - aggregator = GoAggregator() - elif file_ext == ".js" or file_ext == ".ts" or file_ext == ".tsx": - aggregator = JavascriptAggregator() - - fused_tale = aggregator.document(code=code, documentation=tale) - with open(save_path, "w") as file: - file.write(fused_tale) - - @click.command() @click.option( "-p", diff --git a/devtale/utils.py b/devtale/utils.py index ec7dd8e..12e6345 100644 --- a/devtale/utils.py +++ b/devtale/utils.py @@ -11,6 +11,12 @@ from langchain.output_parsers import PydanticOutputParser from langchain.text_splitter import RecursiveCharacterTextSplitter +from devtale.aggregators import ( + GoAggregator, + JavascriptAggregator, + PHPAggregator, + PythonAggregator, +) from devtale.constants import DOCSTRING_LABEL, GPT_PRICE from devtale.schema import FileDocumentation from devtale.templates import ( @@ -82,7 +88,25 @@ def extract_code_elements( return result_string["text"], 
cost +def prepare_code_elements(code_elements): + """Convert GPT text output into a dictionary and combine each + dictionary into a single, general one + """ + elements = {"classes": [], "methods": [], "summary": []} + for code_element in code_elements: + info = _process_extracted_code_element(code_element) + elements["classes"].extend(info["classes"]) + elements["methods"].extend(info["methods"]) + elements["summary"].append(info["summary"]) + + # remove duplicates + elements["classes"] = list(set(elements["classes"])) + elements["methods"] = list(set(elements["methods"])) + return elements + + def _process_extracted_code_element(text: str): + """It converts GPT text output into a dictionary of code elements""" classes_match = re.search(r"classes=(\[.*?\])", text) methods_match = re.search(r"methods=(\[.*?\])", text) summary_match = re.search(r'summary="([^"]*)"', text) @@ -105,18 +129,62 @@ def _process_extracted_code_element(text: str): return {"classes": classes, "methods": methods, "summary": summary} -def prepare_code_elements(code_elements): - elements = {"classes": [], "methods": [], "summary": []} - for code_element in code_elements: - info = _process_extracted_code_element(code_element) - elements["classes"].extend(info["classes"]) - elements["methods"].extend(info["methods"]) - elements["summary"].append(info["summary"]) +def fuse_tales(tales_list, code, code_elements_dict): + """Combine all the generated docstrings JSON-formatted GPT outputs into + a single one, remove hallucinations and duplicates. + """ + fused_tale = {"classes": [], "methods": []} + errors = [] + unique_methods = set() + unique_classes = set() - # remove duplicates - elements["classes"] = list(set(elements["classes"])) - elements["methods"] = list(set(elements["methods"])) - return elements + for tale in tales_list: + if "classes" in tale: + for class_info in tale["classes"]: + if isinstance(class_info, dict): + class_name = class_info["class_name"] + if class_name not in unique_classes and not _is_hallucination( + class_name, code, code_elements_dict["classes"] + ): + unique_classes.add(class_name) + # Attach the devtale label on each docstring + class_info["class_docstring"] = ( + DOCSTRING_LABEL + "\n" + class_info["class_docstring"] + ) + fused_tale["classes"].append(class_info) + else: + if tale not in errors: + errors.append(tale) + + if "methods" in tale: + for method_info in tale["methods"]: + if isinstance(method_info, dict): + method_name = method_info["method_name"] + if method_name not in unique_methods and not _is_hallucination( + method_name, code, code_elements_dict["methods"] + ): + unique_methods.add(method_name) + # Attach the dectale label on each docstring + method_info["method_docstring"] = ( + DOCSTRING_LABEL + "\n" + method_info["method_docstring"] + ) + fused_tale["methods"].append(method_info) + else: + if tale not in errors: + errors.append(tale) + + return fused_tale, errors + + +def _is_hallucination(code_definition, code, expected_definitions): + # Verify that the code_definition is expected + if code_definition not in expected_definitions: + return True + + # Check if the code_definition exists within the code + if not re.search(r"\b" + re.escape(code_definition) + r"\b", code): + return True + return False def redact_tale_information( @@ -210,59 +278,6 @@ def get_unit_tale( return json_answer, cost -def is_hallucination(code_definition, code, expected_definitions): - # Verify that the code_definition is expected - if code_definition not in expected_definitions: - return True - 
- # Check if the code_definition exists within the code - if not re.search(r"\b" + re.escape(code_definition) + r"\b", code): - return True - return False - - -def fuse_tales(tales_list, code, code_elements_dict): - fused_tale = {"classes": [], "methods": []} - errors = [] - unique_methods = set() - unique_classes = set() - - for tale in tales_list: - if "classes" in tale: - for class_info in tale["classes"]: - if isinstance(class_info, dict): - class_name = class_info["class_name"] - if class_name not in unique_classes and not is_hallucination( - class_name, code, code_elements_dict["classes"] - ): - unique_classes.add(class_name) - class_info["class_docstring"] = ( - DOCSTRING_LABEL + "\n" + class_info["class_docstring"] - ) - fused_tale["classes"].append(class_info) - else: - if tale not in errors: - errors.append(tale) - - if "methods" in tale: - for method_info in tale["methods"]: - if isinstance(method_info, dict): - method_name = method_info["method_name"] - if method_name not in unique_methods and not is_hallucination( - method_name, code, code_elements_dict["methods"] - ): - unique_methods.add(method_name) - method_info["method_docstring"] = ( - DOCSTRING_LABEL + "\n" + method_info["method_docstring"] - ) - fused_tale["methods"].append(method_info) - else: - if tale not in errors: - errors.append(tale) - - return fused_tale, errors - - def _add_escape_characters(invalid_json): control_char_pattern = re.compile(r"[\x00-\x1F\x7F-\x9F]") unescaped_chars = control_char_pattern.findall(invalid_json) @@ -306,3 +321,18 @@ def build_project_tree(root_dir, indent="", gitignore_patterns=None): file_paths.append(item_path) return tree, file_paths + + +def fuse_documentation(code, tale, file_ext, save_path): + if file_ext == ".py": + aggregator = PythonAggregator() + elif file_ext == ".php": + aggregator = PHPAggregator() + elif file_ext == ".go": + aggregator = GoAggregator() + elif file_ext == ".js" or file_ext == ".ts" or file_ext == ".tsx": + aggregator = JavascriptAggregator() + + fused_tale = aggregator.document(code=code, documentation=tale) + with open(save_path, "w") as file: + file.write(fused_tale) From 2c90cf2f1d1eb025810945745720ba613f5af338 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Thu, 28 Sep 2023 11:09:24 -0600 Subject: [PATCH 02/11] fix comments format --- cli.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cli.py b/cli.py index 877d223..cc21a6b 100644 --- a/cli.py +++ b/cli.py @@ -322,7 +322,7 @@ def process_file( with open(file_path, "r") as file: code = file.read() - # Return empty devtale if the input file is empty + # Return empty devtale if the input file is empty. if not code: return {"file_docstring": ""}, cost @@ -380,7 +380,7 @@ def process_file( logger.info("prepare code elements") code_elements_dict = prepare_code_elements(code_elements) - # Make a copy to keep the original dict intact + # Make a copy to keep the original dict intact. code_elements_copy = copy.deepcopy(code_elements_dict) # Clean dict copy to remove keys with empty values and the summaries @@ -394,7 +394,7 @@ def process_file( logger.info("create tale sections") tales_list = [] # Generate a docstring for each class and function/method in the - # code_elements + # code_elements. 
if code_elements_copy or cost_estimation: for idx, doc in enumerate(short_docs): tale, call_cost = get_unit_tale( @@ -412,7 +412,7 @@ def process_file( logger.info("create dev tale") tale, errors = fuse_tales(tales_list, code, code_elements_dict) - # Check if we discarded some docstrings + # Check if we discarded some docstrings. if len(errors) > 0: logger.info( f"We encountered errors while fusing the following \ @@ -420,7 +420,7 @@ def process_file( ) # Generate a top-level docstrings using as context all the summaries we got - # from each big_doc code chunk output + # from each big_doc code chunk output. logger.info("add dev tale summary") summaries = split_text(str(code_elements_dict["summary"]), chunk_size=9000) @@ -432,15 +432,15 @@ def process_file( ) cost += call_cost - # Add the docstrings in the code file + # Add the docstrings in the code file. if fuse and not cost_estimation: - # add devtale label into the top-file summary + # add devtale label into the top-file summary. tale["file_docstring"] = DOCSTRING_LABEL + "\n" + file_docstring fused_save_path = os.path.join(output_path, file_name) logger.info(f"save fused dev tale in: {fused_save_path}") fuse_documentation(code, tale, file_ext, save_path=fused_save_path) - # remove devtale label + # Remove devtale label. tale["file_docstring"] = file_docstring logger.info(f"save dev tale in: {save_path}") From 048052b4492117442580c30bf42651618af45ef4 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Thu, 28 Sep 2023 11:55:55 -0600 Subject: [PATCH 03/11] update folder-level --- cli.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/cli.py b/cli.py index cc21a6b..9746c4d 100644 --- a/cli.py +++ b/cli.py @@ -184,18 +184,24 @@ def process_folder( folder_full_name: str = None, cost_estimation: bool = False, ) -> None: + """It creates a dev tale for each file in the directory without exploring + subdirectories, and it generates a README section for the folder. + """ cost = 0 save_path = os.path.join(output_path, os.path.basename(folder_path)) tales = [] + # Iterate through each file in the folder for file_name in os.listdir(folder_path): file_path = os.path.join(folder_path, file_name) + # Check it if is a file that we need to process if os.path.isfile(file_path) and ( os.path.splitext(file_name)[1] in ALLOWED_EXTENSIONS or os.path.splitext(file_name)[1] in ALLOWED_NO_CODE_EXTENSIONS ): logger.info(f"processing {file_path}") + # Create dev tale for the file try: file_tale, file_cost = process_file( file_path, save_path, model_name, fuse, debug, cost_estimation @@ -207,6 +213,8 @@ def process_folder( ) file_tale = None + # Create a dictionary with the tale's file_docstrings values to use them + # as context for the folder's README section if file_tale is not None: if file_tale["file_docstring"]: if not folder_full_name: @@ -214,9 +222,13 @@ def process_folder( os.path.abspath(folder_path) ) + # If this is a root folder, make its name more aesthetic. if folder_full_name == ".": folder_full_name = "./" + # Check if we already have the folder_name as key, if yes, then + # append the file_docstring on it. Useful when working in a + # repository level. folder_entry = next( ( item @@ -230,7 +242,8 @@ def process_folder( "folder_name": folder_full_name, "folder_files": [], } - if folder_full_name == ".": + # Add a generic description in case this is a root directory. 
+                    if folder_full_name == "./":
                         folder_entry[
                             "folder_description"
                         ] = """
@@ -247,28 +260,33 @@ def process_folder(
                 }
             )
 
+    # For the debugging mode we do not want to generate the folder's README
+    # section. We only want to verify the input flow.
     if debug:
-        logger.debug(
-            f"""FOLDER INFO:
-            folder_path: {folder_path}
-            output_path: {output_path}
-            save_path: {save_path}
-            """
-        )
+        logger.debug(f"FOLDER INFO: folder_path: {folder_path}")
+        logger.debug(f"FOLDER INFO: output_path: {output_path}")
+        logger.debug(f"FOLDER INFO: save_path: {save_path}")
         logger.debug(f"FILE_TALES: {tales}")
         return "-", "-", cost
 
     if tales:
+        # Generate the folder's README section using the tale summaries as context.
        files_summaries = split_text(str(tales), chunk_size=10000)
-        # split into two calls to avoid issues with json decoding markdow text.
        folder_readme, fl_cost = redact_tale_information(
            "folder-level",
            files_summaries,
            model_name="gpt-3.5-turbo-16k",
            cost_estimation=cost_estimation,
        )
+
+        # Because of the template, GPT might also add the line separator, so we need
+        # to clean it.
        folder_readme = folder_readme.replace("----------", "")
 
+        # Generate a folder one-line description using the folder's readme as context.
+        # This is a separate call to avoid issues with json attempting to decode
+        # markdown text, and its purpose is to be used as context for the repository
+        # mode.
        folder_overview, fd_cost = redact_tale_information(
            "folder-description",
            folder_readme,
            model_name="gpt-3.5-turbo-16k",
            cost_estimation=cost_estimation,
        )
 
        cost += fl_cost + fd_cost
 
+        # save folder tale if we are not pre-estimating cost.
        if not cost_estimation:
            logger.info("save folder json..")
            with open(os.path.join(save_path, "folder_level.json"), "w") as json_file:

From 4021f01c5ff5ef3964e6628ace83538341c09b12 Mon Sep 17 00:00:00 2001
From: Alberto Gaona
Date: Thu, 28 Sep 2023 12:49:14 -0600
Subject: [PATCH 04/11] update repository-level

---
 cli.py | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/cli.py b/cli.py
index 9746c4d..583c92e 100644
--- a/cli.py
+++ b/cli.py
@@ -41,13 +41,16 @@ def process_repository(
     debug: bool = False,
     cost_estimation: bool = False,
 ) -> None:
+    """It creates a dev tale for each file in the repository, and it
+    generates a README for the whole repository.
+    """
     cost = 0
     folder_tales = {
         "repository_name": os.path.basename(os.path.abspath(root_path)),
         "folders": [],
     }
 
-    # get original readme before creating a new one
+    # Extract the content of the original README if there is one already.
     original_readme_content = None
     for file_name in ["readme.md", "README.md"]:
         readme_path = os.path.join(root_path, file_name)
@@ -61,7 +64,8 @@ def process_repository(
             logger.info(f"Error keeping the original readme file: {e}")
             break
 
-    # get project structure before we modify it
+    # Check if we have a gitignore file to extract the correct project tree
+    # and files.
     gitignore_path = os.path.join(root_path, ".gitignore")
     if os.path.exists(gitignore_path):
         with open(gitignore_path, "r") as gitignore_file:
@@ -71,22 +75,33 @@ def process_repository(
     else:
         gitignore_patterns = None
 
+    # Get the project tree before modifying it, along with the complete list of
+    # files that the repository has.
     project_tree, file_paths = build_project_tree(
         root_path, gitignore_patterns=gitignore_patterns
     )
     project_tree = ".\n" + project_tree
 
+    # Extract the folder paths from the files list. This allows us to avoid
+    # processing folders that should be ignored, and to use the process_folder logic. 
     folders = list(set([os.path.dirname(file_path) for file_path in file_paths]))
+
+    # sort to always have the root folder at the beginning of the list.
     folders = sorted(folders, key=lambda path: path.count("/"))
 
+    # Get the folder's README section of each folder while creating a dev tale
+    # for each file.
     folders_readmes = []
     for folder_path in folders:
         try:
+            # Fix the folder path to avoid issues with the file system.
             if not folder_path.endswith("/"):
                 folder_path += "/"
 
             folder_full_name = os.path.relpath(folder_path, root_path)
+
+            # Generate the folder's README, its one-line description, and
+            # extract the cost of documenting the folder.
             folder_readme, folder_tale, folder_cost = process_folder(
                 folder_path=folder_path,
                 output_path=os.path.join(output_path, folder_full_name)
@@ -107,9 +122,11 @@ def process_repository(
                 )
                 folder_tale = None
 
+            # Create a dictionary with the folder's info that serves as context for
+            # generating the main repository README.
             if folder_tale:
                 folders_readmes.append("\n\n" + folder_readme)
-                # add root folder summary information
+                # Fill in the root folder summary information.
                 if folder_path == folders[0]:
                     folder_tales["folders"].append(
                         {
@@ -126,11 +143,13 @@ def process_repository(
                 }
             )
 
+    # For debugging, we only care about seeing the file input workflow.
     if debug:
         logger.debug(f"FOLDER_TALES: {folder_tales}")
         return None
 
     if folder_tales:
+        # Generate the main README using the folder summaries as context.
         folder_summaries = split_text(str(folder_tales), chunk_size=15000)
         root_readme, call_cost = redact_tale_information(
             "root-level",
@@ -139,18 +158,21 @@ def process_repository(
             cost_estimation=cost_estimation,
         )
         cost += call_cost
+
+        # Because of the template, GPT might also add the line separator, so we need
+        # to clean it.
         root_readme = root_readme.replace("----------", "")
 
-        # inject folders information
+        # Append the folders README sections.
         if folders_readmes:
             folders_information = "\n\n## Folders" + "".join(folders_readmes)
             root_readme = root_readme + folders_information
 
-        # inject project tree
+        # Append the project tree.
         tree = f"\n\n## Project Tree\n```bash\n{project_tree}```\n\n"
         root_readme = root_readme + tree
 
-        # inject original readme if there is one
+        # Append the original readme content as extra notes, removing the header.
         if original_readme_content:
             filtered_original_readme = [
                 line for line in original_readme_content if not line.startswith("# ")
@@ -158,9 +180,9 @@ def process_repository(
             modified_original_readme = "\n\n## Extra notes\n\n" + "".join(
                 filtered_original_readme
             )
-            root_readme = root_readme + modified_original_readme
 
+    # save main README if we are not pre-estimating cost. 
if not cost_estimation: logger.info("save root json..") with open(os.path.join(output_path, "root_level.json"), "w") as json_file: From fb26eea52916e7da9ecb090a0db74e8717d16764 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Thu, 28 Sep 2023 13:11:20 -0600 Subject: [PATCH 05/11] clean --- cli.py | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/cli.py b/cli.py index 583c92e..7dc88fc 100644 --- a/cli.py +++ b/cli.py @@ -26,8 +26,6 @@ ) DEFAULT_OUTPUT_PATH = "devtale_demo/" -DEFAULT_MODEL_NAME = "gpt-4" - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -36,7 +34,6 @@ def process_repository( root_path: str, output_path: str = DEFAULT_OUTPUT_PATH, - model_name: str = DEFAULT_MODEL_NAME, fuse: bool = False, debug: bool = False, cost_estimation: bool = False, @@ -107,7 +104,6 @@ def process_repository( output_path=os.path.join(output_path, folder_full_name) if folder_full_name != "." else output_path, - model_name=model_name, fuse=fuse, debug=debug, folder_full_name=folder_full_name, @@ -200,7 +196,6 @@ def process_repository( def process_folder( folder_path: str, output_path: str = DEFAULT_OUTPUT_PATH, - model_name: str = DEFAULT_MODEL_NAME, fuse: bool = False, debug: bool = False, folder_full_name: str = None, @@ -226,7 +221,7 @@ def process_folder( # Create dev tale for the file try: file_tale, file_cost = process_file( - file_path, save_path, model_name, fuse, debug, cost_estimation + file_path, save_path, fuse, debug, cost_estimation ) cost += file_cost except Exception as e: @@ -337,7 +332,6 @@ def process_folder( def process_file( file_path: str, output_path: str = DEFAULT_OUTPUT_PATH, - model_name: str = DEFAULT_MODEL_NAME, fuse: bool = False, debug: bool = False, cost_estimation: bool = False, @@ -375,7 +369,12 @@ def process_file( with open(save_path, "r") as file: found_tale = json.load(file) if fuse: - fuse_documentation(code, found_tale, output_path, file_name, file_ext) + fuse_documentation( + code=code, + tale=found_tale, + file_ext=file_ext, + save_path=os.path.join(output_path, file_name), + ) return found_tale, cost # For config/bash files we do not aim to document the file itself. We @@ -410,7 +409,7 @@ def process_file( code_elements = [] for idx, doc in enumerate(big_docs): elements_set, call_cost = extract_code_elements( - big_doc=doc, model_name=model_name, cost_estimation=cost_estimation + big_doc=doc, model_name="gpt-4", cost_estimation=cost_estimation ) cost += call_cost if elements_set: @@ -441,7 +440,7 @@ def process_file( tale, call_cost = get_unit_tale( short_doc=doc, code_elements=code_elements_copy, - model_name=model_name, + model_name="gpt-4", cost_estimation=cost_estimation, ) cost += call_cost @@ -522,37 +521,28 @@ def process_file( "output_path", required=False, default=DEFAULT_OUTPUT_PATH, - help="The destination folder where you want to save the documentation outputs", -) -@click.option( - "-n", - "--model-name", - "model_name", - required=False, - default=DEFAULT_MODEL_NAME, - help="The OpenAI model name you want to use. \ - https://platform.openai.com/docs/models", + help="The destination folder where you want to save the documentation outputs. 
\ + Default: devtale_demo/", ) @click.option( "--debug", "debug", is_flag=True, default=False, - help="Mock answer and avoid GPT calls", + help="Mock answers avoiding any GPT call.", ) @click.option( "--estimation", "cost_estimation", is_flag=True, default=False, - help="When true, estimate the cost of openAI's API usage, without making any call", + help="When true, estimate the cost of openAI's API usage, without making any call.", ) def main( path: str, recursive: bool, fuse: bool, output_path: str = DEFAULT_OUTPUT_PATH, - model_name: str = DEFAULT_MODEL_NAME, debug: bool = False, cost_estimation: bool = False, ): @@ -569,7 +559,6 @@ def main( price = process_repository( root_path=path, output_path=output_path, - model_name=model_name, fuse=fuse, debug=debug, cost_estimation=cost_estimation, @@ -579,7 +568,6 @@ def main( _, _, price = process_folder( folder_path=path, output_path=output_path, - model_name=model_name, fuse=fuse, debug=debug, cost_estimation=cost_estimation, @@ -589,7 +577,6 @@ def main( _, price = process_file( file_path=path, output_path=output_path, - model_name=model_name, fuse=fuse, debug=debug, cost_estimation=cost_estimation, From e684743f00a9a27a472b4f5e850f508d4905081c Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Thu, 28 Sep 2023 13:33:08 -0600 Subject: [PATCH 06/11] update action --- README.md | 3 ++- action.yml | 26 +++++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index f0eba83..bd29578 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # devtale -Every software product depends on some legacy, undocumented code repositories, whose authors left the company years ago. +Every software product depends on some legacy, undocumented code repositories, whose authors left the company years ago. Who isn't afraid to make a change, if the code is unreadable? @@ -60,6 +60,7 @@ jobs: path: ${{ github.workspace }} recursive: true target_branch: main + save_tales: false ``` The `recursive` option allows you to document the entire repository. Alternatively, you can specify a specific path to document a single file or folder and set `recursive` to `false`. The workflow action will automatically create the `devtale/documentation` branch and push a new pull request for your review towards the `target_branch`, including the added documentation. diff --git a/action.yml b/action.yml index 9535024..a9bb9ae 100644 --- a/action.yml +++ b/action.yml @@ -6,18 +6,22 @@ branding: inputs: openai_api_key: - description: "Your OpenAI API key" + description: "Your OpenAI API key." required: true path: description: "Path to your repository, folder, or file." required: true recursive: - description: "True if you want to document the full repository. Otherwise False" + description: "True if you want to document the full repository. Otherwise False." required: false default: false target_branch: - description: "Branch name for the documentation pull request." + description: "Base branch name to which the documentation pull request should point." required: true + save_tales: + description: "True if you want to keep the tale files. Otherwise False to remove them." + required: false + default: false runs: using: "composite" @@ -48,12 +52,16 @@ runs: - name: Clean Documentation Files run: | - rm -f *.py.json - rm -f *.php.json - rm -f *.go.json - rm -f *.js.json - rm -f *folder_level.json - rm -f *root_level.json + if ! 
${{ inputs.save_tales }}; then + rm -f *.py.json + rm -f *.php.json + rm -f *.go.json + rm -f *.js.json + rm -f *.ts.json + rm -f *.tsx.json + rm -f *folder_level.json + rm -f *root_level.json + fi shell: bash - name: Push PR From e8d3f0dfa833e82333e6e043f9d785afb3f4ac99 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Thu, 28 Sep 2023 13:58:50 -0600 Subject: [PATCH 07/11] update utils --- cli.py | 4 +- devtale/utils.py | 290 +++++++++++++++++++++++------------------------ 2 files changed, 147 insertions(+), 147 deletions(-) diff --git a/cli.py b/cli.py index 7dc88fc..69d5479 100644 --- a/cli.py +++ b/cli.py @@ -17,7 +17,7 @@ build_project_tree, extract_code_elements, fuse_documentation, - fuse_tales, + fuse_tales_chunks, get_unit_tale, prepare_code_elements, redact_tale_information, @@ -450,7 +450,7 @@ def process_file( # Combine all generated docstrings JSON-formated ouputs into a single, # general one. logger.info("create dev tale") - tale, errors = fuse_tales(tales_list, code, code_elements_dict) + tale, errors = fuse_tales_chunks(tales_list, code, code_elements_dict) # Check if we discarded some docstrings. if len(errors) > 0: diff --git a/devtale/utils.py b/devtale/utils.py index 12e6345..74fdd1b 100644 --- a/devtale/utils.py +++ b/devtale/utils.py @@ -38,16 +38,6 @@ } -def calculate_cost(input: str, model: str): - if model == "text-davinci-003": - encoding = "p50k_base" - else: - encoding = "cl100k_base" - - tokens = tiktoken.get_encoding(encoding).encode(input) - return (len(tokens) / 1000) * GPT_PRICE[model] - - def split_text(text, chunk_size=1000, chunk_overlap=0): text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap @@ -76,7 +66,7 @@ def extract_code_elements( ) if cost_estimation: - estimated_cost = calculate_cost( + estimated_cost = _calculate_cost( prompt.format(code=big_doc.page_content), model_name ) return "", estimated_cost @@ -88,6 +78,72 @@ def extract_code_elements( return result_string["text"], cost +def get_unit_tale( + short_doc, code_elements, model_name="gpt-4", verbose=False, cost_estimation=False +): + parser = PydanticOutputParser(pydantic_object=FileDocumentation) + prompt = PromptTemplate( + template=CODE_LEVEL_TEMPLATE, + input_variables=["code", "code_elements"], + partial_variables={"format_instructions": parser.get_format_instructions()}, + ) + teller_of_tales = LLMChain( + llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose + ) + + if cost_estimation: + estimated_cost = _calculate_cost( + prompt.format( + code=short_doc.page_content, code_elements=str(code_elements) + ), + model_name, + ) + return {"classes": [], "methods": []}, estimated_cost + + with get_openai_callback() as cb: + result_string = teller_of_tales( + {"code": short_doc.page_content, "code_elements": code_elements} + ) + cost = cb.total_cost + + json_answer = _convert_to_json(result_string) + if not json_answer: + print("Returning empty JSON due to a failure") + json_answer = {"classes": [], "methods": []} + return json_answer, cost + + +def redact_tale_information( + content_type, + docs, + verbose=False, + model_name="text-davinci-003", + cost_estimation=False, +): + prompt = PromptTemplate( + template=TYPE_INFORMATION[content_type], input_variables=["information"] + ) + teller_of_tales = LLMChain( + llm=OpenAI(model_name=model_name), prompt=prompt, verbose=verbose + ) + if content_type not in ["no-code-file", "folder-description"]: + information = str(docs[0].page_content) + else: + information = str(docs) + + if 
cost_estimation: + estimated_cost = _calculate_cost( + prompt.format(information=information), model_name + ) + return "", estimated_cost + + with get_openai_callback() as cb: + text_answer = teller_of_tales({"information": information}) + cost = cb.total_cost + + return text_answer["text"], cost + + def prepare_code_elements(code_elements): """Convert GPT text output into a dictionary and combine each dictionary into a single, general one @@ -105,31 +161,7 @@ def prepare_code_elements(code_elements): return elements -def _process_extracted_code_element(text: str): - """It converts GPT text output into a dictionary of code elements""" - classes_match = re.search(r"classes=(\[.*?\])", text) - methods_match = re.search(r"methods=(\[.*?\])", text) - summary_match = re.search(r'summary="([^"]*)"', text) - - classes = [] - methods = [] - summary = "" - - if classes_match: - classes_str = classes_match.group(1) - classes = re.findall(r'"(.*?)"', classes_str) - - if methods_match: - methods_str = methods_match.group(1) - methods = re.findall(r'"(.*?)"', methods_str) - - if summary_match: - summary = summary_match.group(1) - - return {"classes": classes, "methods": methods, "summary": summary} - - -def fuse_tales(tales_list, code, code_elements_dict): +def fuse_tales_chunks(tales_list, code, code_elements_dict): """Combine all the generated docstrings JSON-formatted GPT outputs into a single one, remove hallucinations and duplicates. """ @@ -176,49 +208,58 @@ def fuse_tales(tales_list, code, code_elements_dict): return fused_tale, errors -def _is_hallucination(code_definition, code, expected_definitions): - # Verify that the code_definition is expected - if code_definition not in expected_definitions: - return True +def build_project_tree(root_dir, indent="", gitignore_patterns=None): + if gitignore_patterns is None: + gitignore_patterns = [] - # Check if the code_definition exists within the code - if not re.search(r"\b" + re.escape(code_definition) + r"\b", code): - return True - return False + tree = "" + items = [item for item in os.listdir(root_dir) if not item.startswith(".")] + file_paths = [] + for item in sorted(items): + item_path = os.path.join(root_dir, item) + if _should_ignore(item_path, gitignore_patterns): + continue + if os.path.isdir(item_path): + tree += indent + "├── " + item + "\n" + subtree, subfile_paths = build_project_tree( + item_path, indent + "│ ", gitignore_patterns + ) + tree += subtree + file_paths.extend(subfile_paths) + else: + tree += indent + "└── " + item + "\n" + file_paths.append(item_path) -def redact_tale_information( - content_type, - docs, - verbose=False, - model_name="text-davinci-003", - cost_estimation=False, -): - prompt = PromptTemplate( - template=TYPE_INFORMATION[content_type], input_variables=["information"] - ) - teller_of_tales = LLMChain( - llm=OpenAI(model_name=model_name), prompt=prompt, verbose=verbose - ) - if content_type not in ["no-code-file", "folder-description"]: - information = str(docs[0].page_content) - else: - information = str(docs) + return tree, file_paths - if cost_estimation: - estimated_cost = calculate_cost( - prompt.format(information=information), model_name - ) - return "", estimated_cost - with get_openai_callback() as cb: - text_answer = teller_of_tales({"information": information}) - cost = cb.total_cost +def fuse_documentation(code, tale, file_ext, save_path): + if file_ext == ".py": + aggregator = PythonAggregator() + elif file_ext == ".php": + aggregator = PHPAggregator() + elif file_ext == ".go": + aggregator = 
GoAggregator() + elif file_ext == ".js" or file_ext == ".ts" or file_ext == ".tsx": + aggregator = JavascriptAggregator() - return text_answer["text"], cost + fused_tale = aggregator.document(code=code, documentation=tale) + with open(save_path, "w") as file: + file.write(fused_tale) -def convert_to_json(text_answer): +def _calculate_cost(input: str, model: str): + if model == "text-davinci-003": + encoding = "p50k_base" + else: + encoding = "cl100k_base" + + tokens = tiktoken.get_encoding(encoding).encode(input) + return (len(tokens) / 1000) * GPT_PRICE[model] + + +def _convert_to_json(text_answer): try: result_json = json.loads(text_answer["text"]) return result_json @@ -243,41 +284,6 @@ def convert_to_json(text_answer): return None -def get_unit_tale( - short_doc, code_elements, model_name="gpt-4", verbose=False, cost_estimation=False -): - parser = PydanticOutputParser(pydantic_object=FileDocumentation) - prompt = PromptTemplate( - template=CODE_LEVEL_TEMPLATE, - input_variables=["code", "code_elements"], - partial_variables={"format_instructions": parser.get_format_instructions()}, - ) - teller_of_tales = LLMChain( - llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose - ) - - if cost_estimation: - estimated_cost = calculate_cost( - prompt.format( - code=short_doc.page_content, code_elements=str(code_elements) - ), - model_name, - ) - return {"classes": [], "methods": []}, estimated_cost - - with get_openai_callback() as cb: - result_string = teller_of_tales( - {"code": short_doc.page_content, "code_elements": code_elements} - ) - cost = cb.total_cost - - json_answer = convert_to_json(result_string) - if not json_answer: - print("Returning empty JSON due to a failure") - json_answer = {"classes": [], "methods": []} - return json_answer, cost - - def _add_escape_characters(invalid_json): control_char_pattern = re.compile(r"[\x00-\x1F\x7F-\x9F]") unescaped_chars = control_char_pattern.findall(invalid_json) @@ -289,50 +295,44 @@ def _add_escape_characters(invalid_json): return invalid_json -def _should_ignore(path, gitignore_patterns): - path = Path(path) - for pattern in gitignore_patterns: - if path.match(pattern) or any(p.match(pattern) for p in path.parents): - return True - return False +def _process_extracted_code_element(text: str): + """It converts GPT text output into a dictionary of code elements""" + classes_match = re.search(r"classes=(\[.*?\])", text) + methods_match = re.search(r"methods=(\[.*?\])", text) + summary_match = re.search(r'summary="([^"]*)"', text) + classes = [] + methods = [] + summary = "" -def build_project_tree(root_dir, indent="", gitignore_patterns=None): - if gitignore_patterns is None: - gitignore_patterns = [] + if classes_match: + classes_str = classes_match.group(1) + classes = re.findall(r'"(.*?)"', classes_str) - tree = "" - items = [item for item in os.listdir(root_dir) if not item.startswith(".")] - file_paths = [] + if methods_match: + methods_str = methods_match.group(1) + methods = re.findall(r'"(.*?)"', methods_str) - for item in sorted(items): - item_path = os.path.join(root_dir, item) - if _should_ignore(item_path, gitignore_patterns): - continue - if os.path.isdir(item_path): - tree += indent + "├── " + item + "\n" - subtree, subfile_paths = build_project_tree( - item_path, indent + "│ ", gitignore_patterns - ) - tree += subtree - file_paths.extend(subfile_paths) - else: - tree += indent + "└── " + item + "\n" - file_paths.append(item_path) + if summary_match: + summary = summary_match.group(1) - return tree, file_paths + 
return {"classes": classes, "methods": methods, "summary": summary} -def fuse_documentation(code, tale, file_ext, save_path): - if file_ext == ".py": - aggregator = PythonAggregator() - elif file_ext == ".php": - aggregator = PHPAggregator() - elif file_ext == ".go": - aggregator = GoAggregator() - elif file_ext == ".js" or file_ext == ".ts" or file_ext == ".tsx": - aggregator = JavascriptAggregator() +def _is_hallucination(code_definition, code, expected_definitions): + # Verify that the code_definition is expected + if code_definition not in expected_definitions: + return True - fused_tale = aggregator.document(code=code, documentation=tale) - with open(save_path, "w") as file: - file.write(fused_tale) + # Check if the code_definition exists within the code + if not re.search(r"\b" + re.escape(code_definition) + r"\b", code): + return True + return False + + +def _should_ignore(path, gitignore_patterns): + path = Path(path) + for pattern in gitignore_patterns: + if path.match(pattern) or any(p.match(pattern) for p in path.parents): + return True + return False From 3250becb5d84797f3d10ab6e9226b733b7d95c19 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Fri, 29 Sep 2023 18:58:14 -0600 Subject: [PATCH 08/11] update javascript aggregator --- devtale/aggregators/javascript.py | 38 +++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/devtale/aggregators/javascript.py b/devtale/aggregators/javascript.py index 3ad8878..8e25ac6 100644 --- a/devtale/aggregators/javascript.py +++ b/devtale/aggregators/javascript.py @@ -11,6 +11,7 @@ def document(self, documentation, code): documented_code = self._add_docstrings( documentation, documented_code, type="methods" ) + documented_code = self._add_tsx_docstrings(documentation, documented_code) documented_code = self._add_docstrings( documentation, documented_code, type="classes" ) @@ -49,6 +50,41 @@ def _add_docstrings(self, documentation, code, type="methods"): for i, line in enumerate(lines): if re.findall(pattern, line, re.MULTILINE): if previous_line: + # Check if the function or class is already documented + if "*/" not in previous_line and "//" not in previous_line: + indentation = self._extract_indentation(line) + fixed_docstring = self._break_large_strings(docstring) + fixed_docstring = self._format_docstring( + fixed_docstring, indentation + ) + lines.insert(i, fixed_docstring) + break + elif line.strip(): + previous_line = line + + return "\n".join(lines) + + def _add_tsx_docstrings(self, documentation, code): + entities = documentation["methods"] + lines = code.splitlines() + previous_line = None + + for entity in entities: + name_to_search = entity["method_name"] + docstring = entity["method_docstring"] + + pattern = ( + r"" + + re.escape(name_to_search) + + "\s*=\s*(\(\s*\)\s*=>\s*{|\(\s*([^)]*)\s*\)\s*=>)|" + + re.escape(name_to_search) + + r"\(\)\s*=>\s*{\)" + ) + + for i, line in enumerate(lines): + if re.findall(pattern, line, re.MULTILINE): + if previous_line: + # Check if the function or class is already documented if "*/" not in previous_line and "//" not in previous_line: indentation = self._extract_indentation(line) fixed_docstring = self._break_large_strings(docstring) @@ -74,6 +110,7 @@ def _extract_indentation(self, code_line): return indentation def _format_docstring(self, docstring, indentation): + """It adds the in-line comment key character.""" lines = docstring.split("\n") js_docstring = "\n" + " " * indentation + "/*\n" for line in lines: @@ -84,6 +121,7 @@ def _format_docstring(self, docstring, 
indentation): def _document_file(self, documentation, code): file_description = self._break_large_strings(documentation["file_docstring"]) words = code.split() + # Check if the file already has a top-file docstring if words[0] != "//" and words[0] != "/*" and not words[0].startswith("/*"): code = "/*" + file_description + "*/\n" + code From 3539bb11a893aa585889d1e0bb13830bceeefdc8 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Sat, 30 Sep 2023 19:26:01 -0600 Subject: [PATCH 09/11] update go --- devtale/aggregators/go.py | 4 ++++ devtale/aggregators/javascript.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/devtale/aggregators/go.py b/devtale/aggregators/go.py index 066baad..ea53103 100644 --- a/devtale/aggregators/go.py +++ b/devtale/aggregators/go.py @@ -65,6 +65,9 @@ def _add_docstrings(self, documentation, code, type="method"): return documented_code def _break_large_strings(self, string, max_lenght=90): + """Avoid very long in-line comments by breaking them into smaller + segments with a maximum length. + """ words = string.replace("\\n", " \n ").split() lines = [] current_line = "" @@ -82,6 +85,7 @@ def _break_large_strings(self, string, max_lenght=90): return "\n".join(["// " + line for line in lines]) def _document_file(self, documentation, code): + """Add a top-level docstring if there isn't one already.""" file_description = self._break_large_strings(documentation["file_docstring"]) words = code.split() if words[0] != "//" and words[0] != "/*": diff --git a/devtale/aggregators/javascript.py b/devtale/aggregators/javascript.py index 8e25ac6..a8c0172 100644 --- a/devtale/aggregators/javascript.py +++ b/devtale/aggregators/javascript.py @@ -110,7 +110,7 @@ def _extract_indentation(self, code_line): return indentation def _format_docstring(self, docstring, indentation): - """It adds the in-line comment key character.""" + """Add the in-line comment character key""" lines = docstring.split("\n") js_docstring = "\n" + " " * indentation + "/*\n" for line in lines: @@ -119,6 +119,7 @@ def _format_docstring(self, docstring, indentation): return js_docstring def _document_file(self, documentation, code): + """Add a top-level docstring if there isn't one already.""" file_description = self._break_large_strings(documentation["file_docstring"]) words = code.split() # Check if the file already has a top-file docstring @@ -128,6 +129,9 @@ def _document_file(self, documentation, code): return code def _break_large_strings(self, string, max_lenght=90): + """Avoid very long in-line comments by breaking them into smaller + segments with a maximum length. 
+ """ words = string.replace("\\n", " \n ").split() lines = [] current_line = "" From 3580f74619c6d0eaeb34dbecdad37ba7deb89504 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Sat, 30 Sep 2023 19:34:43 -0600 Subject: [PATCH 10/11] update php --- devtale/aggregators/php.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/devtale/aggregators/php.py b/devtale/aggregators/php.py index 4df7c4e..5961578 100644 --- a/devtale/aggregators/php.py +++ b/devtale/aggregators/php.py @@ -83,6 +83,7 @@ def _document_classes(self, documentation, code): return code def _format_docstring(self, docstring, indentation): + """Add the in-line comment character key""" lines = docstring.split("\n") php_docstring = "\n" + " " * indentation + "/**\n" for line in lines: @@ -109,6 +110,9 @@ def _extract_indentation(self, text, code_line): return indentation def _break_large_strings(self, string, max_lenght=90): + """Avoid very long in-line comments by breaking them into smaller + segments with a maximum length. + """ words = string.split() lines = [] current_line = "" From 0716b5879b23d82c8c09e97ba30f85dbce565963 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Sat, 30 Sep 2023 20:39:19 -0600 Subject: [PATCH 11/11] update python --- devtale/aggregators/python.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/devtale/aggregators/python.py b/devtale/aggregators/python.py index 0f31089..a644b19 100644 --- a/devtale/aggregators/python.py +++ b/devtale/aggregators/python.py @@ -29,19 +29,26 @@ def document(self, documentation, code): code_definitions = self._get_code_definitions(code_w_placeholders) documented_code = code + # For each function/method or class definition we found using AST, we match + # it with the dev tale info. for name, definition in code_definitions.items(): splited_definition = definition.split() - prefix = splited_definition[0] - postfix = splited_definition[-1] + prefix = splited_definition[0] # def, class + postfix = splited_definition[-1] # last text Eg. "->None", "):", etc type_item = "method" if prefix == "def" else "class" - if len(splited_definition) == 2: + # Extract only the last character if we have conflicting text that won't + # allow us to match the pattern. + if len(splited_definition) == 2 or "'" in postfix or '"' in postfix: postfix = postfix[-1] + pattern = r"" + prefix + "\s+" + name + "[\s\S]*?" + re.escape(postfix) docstring = self._get_docstring(type_item, name, documentation) - docstring = self._fix_docstring(docstring) + + # docstring = self._fix_docstring(docstring) + docstring = self._break_large_strings(docstring) comment = f'\n"""{docstring}"""' match = re.findall(pattern, documented_code) @@ -69,6 +76,7 @@ def document(self, documentation, code): return documented_code def _add_file_level_docstring(self, code: str, documentation): + """Add a top-level docstring if there isn't one already.""" file_description = self._break_large_strings(documentation["file_docstring"]) docstring = f'"""{file_description}\n"""\n' @@ -79,6 +87,11 @@ def _add_file_level_docstring(self, code: str, documentation): return code def _add_placeholders(self, code: str): + """AST is capable of adding docstrings to the code; however, it reformats + the file. To avoid this, we add a placeholder that we later search for in + the process. This helps us determine the location where the docstring + should be attached. 
+ """ code_tree = ast.parse(code) placeholder_adder = Placeholder() modified_ast = placeholder_adder.visit(code_tree) @@ -87,6 +100,9 @@ def _add_placeholders(self, code: str): return modified_code def _get_code_definitions(self, code_w_placeholders): + """Search for the placeholder we added and extract the function/method or + class signature. + """ code_definitions = {} lines = code_w_placeholders.splitlines() @@ -139,6 +155,9 @@ def _extract_indentation(self, text, code_line): return indentation_size def _break_large_strings(self, string, max_lenght=90): + """Avoid very long in-line comments by breaking them into smaller + segments with a maximum length. + """ words = string.split() lines = [] current_line = "" @@ -153,7 +172,7 @@ def _break_large_strings(self, string, max_lenght=90): if current_line: lines.append(current_line) - return "\n".join(lines) + return "\n".join([line for line in lines]) def _fix_docstring(self, docstring): pattern = r"^(.*?)(?=Args:|Returns:|$)"