Merge pull request #107 from mdevolde/command_line

jxmorris12 · web-flow · commit ea7494c73728 · 2025-03-31T11:23:47.000-07:00
Command line improvements
diff --git a/language_tool_python/__main__.py b/language_tool_python/__main__.py
@@ -200,10 +200,12 @@ def main() -> int:
 
                     # Messages that end with punctuation already include the
                     # suggestion.
-                    if replacement_text and not message.endswith(('.', '?')):
-                        message += '; suggestions: ' + replacement_text
+                    if replacement_text and not message.endswith('?'):
+                        message += ' Suggestions: ' + replacement_text
+                    
+                    line, column = match.get_line_and_column(text)
 
-                    print(f'{filename}: {rule_id}: {message}')
+                    print(f'{filename}:{line}:{column}: {rule_id}: {message}')
 
                     status = 2
         except LanguageToolError as exception:
diff --git a/language_tool_python/match.py b/language_tool_python/match.py
@@ -1,6 +1,6 @@
 import unicodedata
 from collections import OrderedDict
-from typing import Any, Dict, Iterator, OrderedDict as OrderedDictType
+from typing import Any, Dict, Tuple, Iterator, OrderedDict as OrderedDictType, List, Optional
 from functools import total_ordering
 
 def get_match_ordered_dict() -> OrderedDictType[str, type]:
@@ -58,21 +58,29 @@ def auto_type(obj: Any) -> Any:
         except ValueError:
             return obj
 
-""" Sample match JSON:
-    {
-        'message': 'Possible spelling mistake found.', 
-        'shortMessage': 'Spelling mistake', 
-        'replacements': [{'value': 'newt'}, {'value': 'not'}, {'value': 'new', 'shortDescription': 'having just been made'}, {'value': 'news'}, {'value': 'foot', 'shortDescription': 'singular'}, {'value': 'root', 'shortDescription': 'underground organ of a plant'}, {'value': 'boot'}, {'value': 'noon'}, {'value': 'loot', 'shortDescription': 'plunder'}, {'value': 'moot'}, {'value': 'Root'}, {'value': 'soot', 'shortDescription': 'carbon black'}, {'value': 'newts'}, {'value': 'nook'}, {'value': 'Lieut'}, {'value': 'coot'}, {'value': 'hoot'}, {'value': 'toot'}, {'value': 'snoot'}, {'value': 'neut'}, {'value': 'nowt'}, {'value': 'Noor'}, {'value': 'noob'}], 
-        'offset': 8, 
-        'length': 4, 
-        'context': {'text': 'This is noot okay. ', 'offset': 8, 'length': 4}, 'sentence': 'This is noot okay.', 
-        'type': {'typeName': 'Other'}, 
-        'rule': {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible spelling mistake', 'issueType': 'misspelling', 'category': {'id': 'TYPOS', 'name': 'Possible Typo'}}, 
-        'ignoreForIncompleteSentence': False, 
-        'contextForSureMatch': 0
-    }
-
-"""
+def four_byte_char_positions(text: str) -> List[int]:
+    """
+    Identify positions of 4-byte encoded characters in a UTF-8 string.
+    This function scans through the input text and identifies the positions
+    of characters that are encoded with 4 bytes in UTF-8. These characters
+    are typically non-BMP (Basic Multilingual Plane) characters, such as
+    certain emoji and some rare Chinese, Japanese, and Korean characters.
+
+    :param text: The input string to be analyzed.
+    :type text: str
+    :return: A list of positions where 4-byte encoded characters are found.
+    :rtype: List[int]
+    """
+    positions = []
+    char_index = 0
+    for char in text:
+        if len(char.encode('utf-8')) == 4:
+            positions.append(char_index)
+            # Adding 1 to the index because 4 byte characters are
+            # 2 bytes in length in LanguageTool, instead of 1 byte in Python.
+            char_index += 1
+        char_index += 1
+    return positions
 
 @total_ordering
 class Match:
@@ -92,8 +100,12 @@ class Match:
 
                        - 'message': The message describing the error.
     :type attrib: Dict[str, Any]
+    :param text: The original text in which the error occurred (the whole text, not just the context).
+    :type text: str
 
     Attributes:
+        PREVIOUS_MATCHES_TEXT (Optional[str]): The text of the previous match object.
+        FOUR_BYTES_POSITIONS (Optional[List[int]]): The positions of 4-byte encoded characters in the text, registered by the previous match object (kept for optimization purposes if the text is the same).
         ruleId (str): The ID of the rule that was violated.
         message (str): The message describing the error.
         replacements (list): A list of suggested replacements for the error.
@@ -103,13 +115,40 @@ class Match:
         errorLength (int): The length of the error.
         category (str): The category of the rule that was violated.
         ruleIssueType (str): The issue type of the rule that was violated.
+
+    Exemple of a match object received from the LanguageTool API :
+    
+    ```
+    {
+        'message': 'Possible spelling mistake found.', 
+        'shortMessage': 'Spelling mistake', 
+        'replacements': [{'value': 'newt'}, {'value': 'not'}, {'value': 'new', 'shortDescription': 'having just been made'}, {'value': 'news'}, {'value': 'foot', 'shortDescription': 'singular'}, {'value': 'root', 'shortDescription': 'underground organ of a plant'}, {'value': 'boot'}, {'value': 'noon'}, {'value': 'loot', 'shortDescription': 'plunder'}, {'value': 'moot'}, {'value': 'Root'}, {'value': 'soot', 'shortDescription': 'carbon black'}, {'value': 'newts'}, {'value': 'nook'}, {'value': 'Lieut'}, {'value': 'coot'}, {'value': 'hoot'}, {'value': 'toot'}, {'value': 'snoot'}, {'value': 'neut'}, {'value': 'nowt'}, {'value': 'Noor'}, {'value': 'noob'}], 
+        'offset': 8, 
+        'length': 4, 
+        'context': {'text': 'This is noot okay. ', 'offset': 8, 'length': 4}, 'sentence': 'This is noot okay.', 
+        'type': {'typeName': 'Other'}, 
+        'rule': {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible spelling mistake', 'issueType': 'misspelling', 'category': {'id': 'TYPOS', 'name': 'Possible Typo'}}, 
+        'ignoreForIncompleteSentence': False, 
+        'contextForSureMatch': 0
+    }
+    ```
     """
+
+    PREVIOUS_MATCHES_TEXT: Optional[str] = None
+    FOUR_BYTES_POSITIONS: Optional[List[int]] = None
     
-    def __init__(self, attrib: Dict[str, Any]) -> None:
+    def __init__(self, attrib: Dict[str, Any], text: str) -> None:
         """
         Initialize a Match object with the given attributes.
         The method processes and normalizes the attributes before storing them on the object.
+        This method adjusts the positions of 4-byte encoded characters in the text
+        to ensure the offsets of the matches are correct.
         """
+        if text is None:
+            raise ValueError("The text parameter must not be None")
+        elif not isinstance(text, str):
+            raise TypeError("The text parameter must be a string")
+        
         # Process rule.
         attrib['category'] = attrib['rule']['category']['id']
         attrib['ruleId'] = attrib['rule']['id']
@@ -127,6 +166,13 @@ def __init__(self, attrib: Dict[str, Any]) -> None:
         # Store objects on self.
         for k, v in attrib.items():
             setattr(self, k, v)
+        
+        if Match.PREVIOUS_MATCHES_TEXT != text:
+            Match.PREVIOUS_MATCHES_TEXT = text
+            Match.FOUR_BYTES_POSITIONS = four_byte_char_positions(text)
+        # Get the positions of 4-byte encoded characters in the text because without 
+        # carrying out this step, the offsets of the matches could be incorrect.
+        self.offset -= sum(1 for pos in Match.FOUR_BYTES_POSITIONS if pos < self.offset)
 
     def __repr__(self) -> str:
         """
@@ -185,6 +231,23 @@ def matchedText(self) -> str:
         :rtype: str
         """
         return self.context[self.offsetInContext:self.offsetInContext+self.errorLength]
+
+    def get_line_and_column(self, original_text: str) -> Tuple[int, int]:
+        """
+        Returns the line and column number of the error in the context.
+
+        :param original_text: The original text in which the error occurred. We need this to calculate the line and column number, because the context has no more newline characters.
+        :type original_text: str
+        :return: A tuple containing the line and column number of the error.
+        :rtype: Tuple[int, int]
+        """
+
+        context_without_additions = self.context[3:-3] if len(self.context) > 6 else self.context
+        if context_without_additions not in original_text.replace('\n', ' '):
+            raise ValueError('The original text does not match the context of the error')
+        line = original_text.count('\n', 0, self.offset)
+        column = self.offset - original_text.rfind('\n', 0, self.offset)
+        return line + 1, column
     
     def select_replacement(self, index: int) -> None:
         """
diff --git a/language_tool_python/server.py b/language_tool_python/server.py
@@ -273,7 +273,7 @@ def check(self, text: str) -> List[Match]:
         url = urllib.parse.urljoin(self._url, 'check')
         response = self._query_server(url, self._create_params(text))
         matches = response['matches']
-        return [Match(match) for match in matches]
+        return [Match(match, text) for match in matches]
 
     def _create_params(self, text: str) -> Dict[str, str]:
         """
diff --git a/language_tool_python/utils.py b/language_tool_python/utils.py
@@ -88,37 +88,9 @@ def parse_url(url_str: str) -> str:
     return urllib.parse.urlparse(url_str).geturl()
 
 
-def _4_bytes_encoded_positions(text: str) -> List[int]:
-    """
-    Identify positions of 4-byte encoded characters in a UTF-8 string.
-    This function scans through the input text and identifies the positions
-    of characters that are encoded with 4 bytes in UTF-8. These characters
-    are typically non-BMP (Basic Multilingual Plane) characters, such as
-    certain emoji and some rare Chinese, Japanese, and Korean characters.
-
-    :param text: The input string to be analyzed.
-    :type text: str
-    :return: A list of positions where 4-byte encoded characters are found.
-    :rtype: List[int]
-    """
-    positions = []
-    char_index = 0
-    for char in text:
-        if len(char.encode('utf-8')) == 4:
-            positions.append(char_index)
-            # Adding 1 to the index because 4 byte characters are
-            # 2 bytes in length in LanguageTool, instead of 1 byte in Python.
-            char_index += 1
-        char_index += 1
-    return positions
-
-
 def correct(text: str, matches: List[Match]) -> str:
     """
     Corrects the given text based on the provided matches.
-    This function adjusts the positions of 4-byte encoded characters in the text
-    to ensure the offsets of the matches are correct. It then applies the corrections
-    specified in the matches to the text.
     Only the first replacement for each match is applied to the text.
 
     :param text: The original text to be corrected.
@@ -128,10 +100,6 @@ def correct(text: str, matches: List[Match]) -> str:
     :return: The corrected text.
     :rtype: str
     """
-    # Get the positions of 4-byte encoded characters in the text because without 
-    # carrying out this step, the offsets of the matches could be incorrect.
-    for match in matches:
-        match.offset -= sum(1 for i in _4_bytes_encoded_positions(text) if i <= match.offset)
     ltext = list(text)
     matches = [match for match in matches if match.replacements]
     errors = [ltext[match.offset:match.offset + match.errorLength]
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "language_tool_python"
-version = "2.9.1"
+version = "2.9.2"
 requires-python = ">=3.9"
 description = "Checks grammar using LanguageTool."
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -25,6 +25,9 @@ dev = [
     "pytest-runner"
 ]
 
+[project.scripts]
+language_tool_python = "language_tool_python.__main__:main"
+
 [build-system]
 requires = ["setuptools>=61.0", "wheel"]
 build-backend = "setuptools.build_meta"