Skip to content

Commit ea7494c

Browse files
authored
Merge pull request #107 from mdevolde/command_line
Command line improvements
2 parents ecaabec + f961f96 commit ea7494c

File tree

5 files changed

+90
-54
lines changed

5 files changed

+90
-54
lines changed

language_tool_python/__main__.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -200,10 +200,12 @@ def main() -> int:
200200

201201
# Messages that end with punctuation already include the
202202
# suggestion.
203-
if replacement_text and not message.endswith(('.', '?')):
204-
message += '; suggestions: ' + replacement_text
203+
if replacement_text and not message.endswith('?'):
204+
message += ' Suggestions: ' + replacement_text
205+
206+
line, column = match.get_line_and_column(text)
205207

206-
print(f'{filename}: {rule_id}: {message}')
208+
print(f'{filename}:{line}:{column}: {rule_id}: {message}')
207209

208210
status = 2
209211
except LanguageToolError as exception:

language_tool_python/match.py

+80-17
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import unicodedata
22
from collections import OrderedDict
3-
from typing import Any, Dict, Iterator, OrderedDict as OrderedDictType
3+
from typing import Any, Dict, Tuple, Iterator, OrderedDict as OrderedDictType, List, Optional
44
from functools import total_ordering
55

66
def get_match_ordered_dict() -> OrderedDictType[str, type]:
@@ -58,21 +58,29 @@ def auto_type(obj: Any) -> Any:
5858
except ValueError:
5959
return obj
6060

61-
""" Sample match JSON:
62-
{
63-
'message': 'Possible spelling mistake found.',
64-
'shortMessage': 'Spelling mistake',
65-
'replacements': [{'value': 'newt'}, {'value': 'not'}, {'value': 'new', 'shortDescription': 'having just been made'}, {'value': 'news'}, {'value': 'foot', 'shortDescription': 'singular'}, {'value': 'root', 'shortDescription': 'underground organ of a plant'}, {'value': 'boot'}, {'value': 'noon'}, {'value': 'loot', 'shortDescription': 'plunder'}, {'value': 'moot'}, {'value': 'Root'}, {'value': 'soot', 'shortDescription': 'carbon black'}, {'value': 'newts'}, {'value': 'nook'}, {'value': 'Lieut'}, {'value': 'coot'}, {'value': 'hoot'}, {'value': 'toot'}, {'value': 'snoot'}, {'value': 'neut'}, {'value': 'nowt'}, {'value': 'Noor'}, {'value': 'noob'}],
66-
'offset': 8,
67-
'length': 4,
68-
'context': {'text': 'This is noot okay. ', 'offset': 8, 'length': 4}, 'sentence': 'This is noot okay.',
69-
'type': {'typeName': 'Other'},
70-
'rule': {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible spelling mistake', 'issueType': 'misspelling', 'category': {'id': 'TYPOS', 'name': 'Possible Typo'}},
71-
'ignoreForIncompleteSentence': False,
72-
'contextForSureMatch': 0
73-
}
74-
75-
"""
61+
def four_byte_char_positions(text: str) -> List[int]:
62+
"""
63+
Identify positions of 4-byte encoded characters in a UTF-8 string.
64+
This function scans through the input text and identifies the positions
65+
of characters that are encoded with 4 bytes in UTF-8. These characters
66+
are typically non-BMP (Basic Multilingual Plane) characters, such as
67+
certain emoji and some rare Chinese, Japanese, and Korean characters.
68+
69+
:param text: The input string to be analyzed.
70+
:type text: str
71+
:return: A list of positions where 4-byte encoded characters are found.
72+
:rtype: List[int]
73+
"""
74+
positions = []
75+
char_index = 0
76+
for char in text:
77+
if len(char.encode('utf-8')) == 4:
78+
positions.append(char_index)
79+
# Adding 1 to the index because 4 byte characters are
80+
# 2 UTF-16 code units in length in LanguageTool, instead of 1 code point in Python.
81+
char_index += 1
82+
char_index += 1
83+
return positions
7684

7785
@total_ordering
7886
class Match:
@@ -92,8 +100,12 @@ class Match:
92100
93101
- 'message': The message describing the error.
94102
:type attrib: Dict[str, Any]
103+
:param text: The original text in which the error occurred (the whole text, not just the context).
104+
:type text: str
95105
96106
Attributes:
107+
PREVIOUS_MATCHES_TEXT (Optional[str]): The text of the previous match object.
108+
FOUR_BYTES_POSITIONS (Optional[List[int]]): The positions of 4-byte encoded characters in the text, registered by the previous match object (kept for optimization purposes if the text is the same).
97109
ruleId (str): The ID of the rule that was violated.
98110
message (str): The message describing the error.
99111
replacements (list): A list of suggested replacements for the error.
@@ -103,13 +115,40 @@ class Match:
103115
errorLength (int): The length of the error.
104116
category (str): The category of the rule that was violated.
105117
ruleIssueType (str): The issue type of the rule that was violated.
118+
119+
Example of a match object received from the LanguageTool API:
120+
121+
```
122+
{
123+
'message': 'Possible spelling mistake found.',
124+
'shortMessage': 'Spelling mistake',
125+
'replacements': [{'value': 'newt'}, {'value': 'not'}, {'value': 'new', 'shortDescription': 'having just been made'}, {'value': 'news'}, {'value': 'foot', 'shortDescription': 'singular'}, {'value': 'root', 'shortDescription': 'underground organ of a plant'}, {'value': 'boot'}, {'value': 'noon'}, {'value': 'loot', 'shortDescription': 'plunder'}, {'value': 'moot'}, {'value': 'Root'}, {'value': 'soot', 'shortDescription': 'carbon black'}, {'value': 'newts'}, {'value': 'nook'}, {'value': 'Lieut'}, {'value': 'coot'}, {'value': 'hoot'}, {'value': 'toot'}, {'value': 'snoot'}, {'value': 'neut'}, {'value': 'nowt'}, {'value': 'Noor'}, {'value': 'noob'}],
126+
'offset': 8,
127+
'length': 4,
128+
'context': {'text': 'This is noot okay. ', 'offset': 8, 'length': 4}, 'sentence': 'This is noot okay.',
129+
'type': {'typeName': 'Other'},
130+
'rule': {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible spelling mistake', 'issueType': 'misspelling', 'category': {'id': 'TYPOS', 'name': 'Possible Typo'}},
131+
'ignoreForIncompleteSentence': False,
132+
'contextForSureMatch': 0
133+
}
134+
```
106135
"""
136+
137+
PREVIOUS_MATCHES_TEXT: Optional[str] = None
138+
FOUR_BYTES_POSITIONS: Optional[List[int]] = None
107139

108-
def __init__(self, attrib: Dict[str, Any]) -> None:
140+
def __init__(self, attrib: Dict[str, Any], text: str) -> None:
109141
"""
110142
Initialize a Match object with the given attributes.
111143
The method processes and normalizes the attributes before storing them on the object.
144+
This method adjusts the positions of 4-byte encoded characters in the text
145+
to ensure the offsets of the matches are correct.
112146
"""
147+
if text is None:
148+
raise ValueError("The text parameter must not be None")
149+
elif not isinstance(text, str):
150+
raise TypeError("The text parameter must be a string")
151+
113152
# Process rule.
114153
attrib['category'] = attrib['rule']['category']['id']
115154
attrib['ruleId'] = attrib['rule']['id']
@@ -127,6 +166,13 @@ def __init__(self, attrib: Dict[str, Any]) -> None:
127166
# Store objects on self.
128167
for k, v in attrib.items():
129168
setattr(self, k, v)
169+
170+
if Match.PREVIOUS_MATCHES_TEXT != text:
171+
Match.PREVIOUS_MATCHES_TEXT = text
172+
Match.FOUR_BYTES_POSITIONS = four_byte_char_positions(text)
173+
# Get the positions of 4-byte encoded characters in the text because without
174+
# carrying out this step, the offsets of the matches could be incorrect.
175+
self.offset -= sum(1 for pos in Match.FOUR_BYTES_POSITIONS if pos < self.offset)
130176

131177
def __repr__(self) -> str:
132178
"""
@@ -185,6 +231,23 @@ def matchedText(self) -> str:
185231
:rtype: str
186232
"""
187233
return self.context[self.offsetInContext:self.offsetInContext+self.errorLength]
234+
235+
def get_line_and_column(self, original_text: str) -> Tuple[int, int]:
236+
"""
237+
Returns the line and column number of the error in the original text.
238+
239+
:param original_text: The original text in which the error occurred. It is needed to calculate the line and column number, because the context no longer contains newline characters.
240+
:type original_text: str
241+
:return: A tuple containing the line and column number of the error.
242+
:rtype: Tuple[int, int]
243+
"""
244+
245+
context_without_additions = self.context[3:-3] if len(self.context) > 6 else self.context
246+
if context_without_additions not in original_text.replace('\n', ' '):
247+
raise ValueError('The original text does not match the context of the error')
248+
line = original_text.count('\n', 0, self.offset)
249+
column = self.offset - original_text.rfind('\n', 0, self.offset)
250+
return line + 1, column
188251

189252
def select_replacement(self, index: int) -> None:
190253
"""

language_tool_python/server.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def check(self, text: str) -> List[Match]:
273273
url = urllib.parse.urljoin(self._url, 'check')
274274
response = self._query_server(url, self._create_params(text))
275275
matches = response['matches']
276-
return [Match(match) for match in matches]
276+
return [Match(match, text) for match in matches]
277277

278278
def _create_params(self, text: str) -> Dict[str, str]:
279279
"""

language_tool_python/utils.py

-32
Original file line numberDiff line numberDiff line change
@@ -88,37 +88,9 @@ def parse_url(url_str: str) -> str:
8888
return urllib.parse.urlparse(url_str).geturl()
8989

9090

91-
def _4_bytes_encoded_positions(text: str) -> List[int]:
92-
"""
93-
Identify positions of 4-byte encoded characters in a UTF-8 string.
94-
This function scans through the input text and identifies the positions
95-
of characters that are encoded with 4 bytes in UTF-8. These characters
96-
are typically non-BMP (Basic Multilingual Plane) characters, such as
97-
certain emoji and some rare Chinese, Japanese, and Korean characters.
98-
99-
:param text: The input string to be analyzed.
100-
:type text: str
101-
:return: A list of positions where 4-byte encoded characters are found.
102-
:rtype: List[int]
103-
"""
104-
positions = []
105-
char_index = 0
106-
for char in text:
107-
if len(char.encode('utf-8')) == 4:
108-
positions.append(char_index)
109-
# Adding 1 to the index because 4 byte characters are
110-
# 2 bytes in length in LanguageTool, instead of 1 byte in Python.
111-
char_index += 1
112-
char_index += 1
113-
return positions
114-
115-
11691
def correct(text: str, matches: List[Match]) -> str:
11792
"""
11893
Corrects the given text based on the provided matches.
119-
This function adjusts the positions of 4-byte encoded characters in the text
120-
to ensure the offsets of the matches are correct. It then applies the corrections
121-
specified in the matches to the text.
12294
Only the first replacement for each match is applied to the text.
12395
12496
:param text: The original text to be corrected.
@@ -128,10 +100,6 @@ def correct(text: str, matches: List[Match]) -> str:
128100
:return: The corrected text.
129101
:rtype: str
130102
"""
131-
# Get the positions of 4-byte encoded characters in the text because without
132-
# carrying out this step, the offsets of the matches could be incorrect.
133-
for match in matches:
134-
match.offset -= sum(1 for i in _4_bytes_encoded_positions(text) if i <= match.offset)
135103
ltext = list(text)
136104
matches = [match for match in matches if match.replacements]
137105
errors = [ltext[match.offset:match.offset + match.errorLength]

pyproject.toml

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "language_tool_python"
3-
version = "2.9.1"
3+
version = "2.9.2"
44
requires-python = ">=3.9"
55
description = "Checks grammar using LanguageTool."
66
readme = { file = "README.md", content-type = "text/markdown" }
@@ -25,6 +25,9 @@ dev = [
2525
"pytest-runner"
2626
]
2727

28+
[project.scripts]
29+
language_tool_python = "language_tool_python.__main__:main"
30+
2831
[build-system]
2932
requires = ["setuptools>=61.0", "wheel"]
3033
build-backend = "setuptools.build_meta"

0 commit comments

Comments
 (0)