-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add test for checking 404 urls * Add test to check duplicate slugs * Fix Wikipedia link for languages we only have a code * Fix hyperlinks * Add CI tests * Add supported version of sass-embedded * Add sass-embedded plugin * Downgrade sass-embedded plugin version * Remove sass-embedded gem * Change ruby version to 3.2.0 * Change html-proofer version * Change Omniscien id * Update htmlproofer * Change Omniscien id * Update .gitignore * Run generate.py * Change tests order * Change file paths * Add GitHub Metadata * Add repo name * Serve Jekyll in the background * Update requirements * Remove sleep * Update requirements * Update requirements * Update requirements. * update requirements * update requirements * update requirements * update requirements * Add update pytest run * Fix broken links * Fix json syntax and translation hub id * Run generate.py * Change Google id * Run generate.py * Change validation * Add comments * Change request logic with local lookup * Add link to GitHub Metadata issue solution * Slice codes list * Fix bug * Change phrase-mtqe to phrase-tms * Correct API name * Modify requirements.txt * Change domain * Build jekyll before checking broken links * Remove comment * Enforce style * Add assertion error message * . * Add test to check for US-specific spellings * Update requirements * Update .gitignore * Add model installation * Remove enchant --------- Co-authored-by: Tovmas <tharrison748@gmail.com>
- Loading branch information
1 parent
35cf483
commit 88b97f5
Showing
82 changed files
with
518 additions
and
125 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
import os | ||
import re | ||
|
||
|
||
def walk_directory(directory, exclude_dirs=None, exclude_files=None): | ||
exclude_dirs = exclude_dirs or [] | ||
exclude_files = exclude_files or [] | ||
|
||
for root, dirs, files in os.walk(directory): | ||
if all(exclude_dir not in root for exclude_dir in exclude_dirs): | ||
dirs[:] = [d for d in dirs if not d.startswith(('.', '_'))] | ||
for file in files: | ||
if file.endswith(".md") and file not in exclude_files: | ||
yield os.path.join(root, file) | ||
|
||
def preprocess_article_names(file_paths):
    """Turn file paths into article names: basename, no extension, '-' -> ' '."""
    names = []
    for path in file_paths:
        stem, _ext = os.path.splitext(os.path.basename(path))
        names.append(stem.replace('-', ' '))
    return names
|
||
def create_unlinked_word_pattern(article_names):
    """Compile a case-insensitive whole-word regex matching any article name."""
    alternatives = '|'.join(re.escape(name) for name in article_names)
    return re.compile(r'\b(?:' + alternatives + r')\b', re.IGNORECASE)
|
||
def check_file(file_path, unlinked_word_pattern):
    """Interactively report unlinked mentions of other article names in one file.

    For each article name that appears in *file_path* without being wrapped
    in a Markdown link, prints a link suggestion and pauses for ENTER.
    Returns None. Files marked ``autogenerated: true`` are skipped entirely.
    """
    suggestions = set()  # (article_name, file_path) pairs already reported
    # NOTE(review): splits on '/' — not portable to Windows paths;
    # os.path.basename would be safer. Verify the intended platforms.
    file_name = file_path.split('/')[-1]

    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip the first line (the opening '---' of the frontmatter).
        f.seek(0)
        next(f)
        is_frontmatter = True
        in_html_block = False

        content = f.read()
        for line in content.splitlines():

            # Autogenerated files are maintained elsewhere — stop immediately.
            if line.strip() == "autogenerated: true":
                return

            # Skip frontmatter lines until the closing '---' is seen.
            if is_frontmatter:
                if line.startswith('---'):
                    is_frontmatter = False
                continue

            # Skip HTML.
            # NOTE(review): in_html_block is never reset to False, so the
            # first line containing both '<' and '>' silences the rest of
            # the file — confirm this is intentional.
            if '<' in line and '>' in line:
                in_html_block = True

            if in_html_block:
                continue

            _line = line.lower()

            # Remove existing Markdown links and Liquid tags so their text
            # cannot be re-suggested as an unlinked mention.
            _line = re.sub(r'\[(.*?)\]\([^)]*?\)|\{\{[^}]*\}\}|{%[^\n]*%}', ' ... ', _line)

            # Search for suggestions
            for match in unlinked_word_pattern.finditer(_line):
                article_name = match.group()
                suggestion_key = (article_name, file_path)
                # If it's a link to our own file, skip.
                if file_name.lower() == f'{article_name.replace(" ", "-")}.md':
                    continue
                # Skip the file if the word has already been linked
                # anywhere in it (singular or simple plural form).
                if f'[{article_name}](/' in content or f'[{article_name}s](/' in content:
                    break
                # If we already have this suggestion, skip.
                if suggestion_key in suggestions:
                    continue
                suggestions.add(suggestion_key)

                print(f'"{article_name}" in {file_path.split("/")[-1]}')
                print(f'\t at the line: {line}')
                print(f'\t can be linked: [{article_name}](/{article_name.replace(" ", "-")}) \n')
                print(30 * '-')
                input("Press ENTER for more...")  # Pause so each suggestion can be reviewed
                print(30 * '-')
|
||
|
||
|
||
# Directories excluded from the article scan.
EXCLUDE_DIRS = ['events', 'vendor']  # Events can be removed as soon as it becomes autogenerated
# File names excluded from the article scan.
EXCLUDE_FILES = ['README.md', 'CHANGELOG.md', 'events']
# Repository root — relative to this script's directory (.github/...);
# NOTE(review): this assumes the script is run from its own directory.
DIR = ('../..')
|
||
def main():
    """Collect all article files, then report unlinked mentions in each one."""
    article_paths = list(walk_directory(DIR, exclude_dirs=EXCLUDE_DIRS, exclude_files=EXCLUDE_FILES))
    pattern = create_unlinked_word_pattern(preprocess_article_names(article_paths))
    for path in article_paths:
        check_file(path, pattern)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
beautifulsoup4==4.11.1 | ||
bs4==0.0.1 | ||
charset-normalizer==2.1.1 | ||
html5lib==1.1 | ||
lxml==4.9.3 | ||
pluggy==1.3.0 | ||
pyenchant==3.2.2 | ||
pytest==7.4.3 | ||
pytest-asyncio==0.23.1 | ||
pytest-pyodide==0.55.1 | ||
python-dateutil==2.8.2 | ||
PyYAML==6.0.1 | ||
requests==2.31.0 | ||
six==1.16.0 | ||
soupsieve==2.5 | ||
typing_extensions==4.8.0 | ||
Unidecode==1.3.7 | ||
urllib3==2.1.0 | ||
zipp==1.0.0 | ||
spacy==3.7.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import concurrent.futures | ||
import requests | ||
from bs4 import BeautifulSoup | ||
import pytest | ||
|
||
|
||
class TestSitemapRequests:
    """Check that every URL listed in the production sitemap returns HTTP 200."""

    # Timeout (seconds) applied to every HTTP request.
    TIMEOUT = 20

    # Fixture to create a session for each test method
    @pytest.fixture
    def session(self):
        with requests.Session() as session:
            yield session

    # Helper function to fetch a URL with retries.
    # Returns (url, status_code); status 0 means every attempt failed.
    def fetch_url(self, session, url, timeout=TIMEOUT):
        max_retries = 3
        for attempt in range(max_retries):
            try:
                # Bug fix: previously no timeout was passed here (the test's
                # timeout only covered the sitemap fetch), so a single hung
                # URL could stall its worker thread indefinitely.
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
                return url, response.status_code
            except requests.exceptions.RequestException as e:
                if attempt < max_retries - 1:
                    print(f"Retrying {url} (Attempt {attempt + 1})")
                else:
                    print(f"Error for URL {url}: {e}")
                    return url, 0

    # Test method for sitemap requests
    def test_sitemap_requests(self, session):
        base_url = 'https://machinetranslate.org'
        sitemap_url = base_url + '/sitemap'

        # Fetch the sitemap
        response = session.get(sitemap_url, timeout=self.TIMEOUT)
        assert response.status_code == 200

        # Parse the sitemap using BeautifulSoup
        soup = BeautifulSoup(response.text, 'xml')
        url_elements = soup.find_all('loc')

        # Extract URLs from the sitemap
        urls = [url_element.text for url_element in url_elements]

        # Fetch all URLs concurrently
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.fetch_url, session, url) for url in urls]

            # Collect URLs with non-200 status codes
            not_found_urls = []
            for future in concurrent.futures.as_completed(futures):
                url, status_code = future.result()
                if status_code != 200:
                    not_found_urls.append(url)

        # Assert that there are no URLs with non-200 status codes
        assert len(not_found_urls) == 0, f'Not found URLs: {not_found_urls}'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import re | ||
|
||
import enchant | ||
import spacy | ||
|
||
from .find_missing_links import walk_directory | ||
|
||
|
||
# Load the small English spaCy pipeline once at import time
# (requires `python -m spacy download en_core_web_sm` to have been run).
NLP = spacy.load("en_core_web_sm")
|
||
def is_proper_noun_in_context(line, word):
    """Return True if *word* occurs in *line* tagged as a proper noun (PROPN)."""
    return any(
        token.pos_ == 'PROPN'
        for token in NLP(line)
        if token.text == word
    )
|
||
def is_camel_case(word):
    """Return True if any character after the first is uppercase (e.g. 'iPhone')."""
    for ch in word[1:]:
        if ch.isupper():
            return True
    return False
|
||
def check_spellings(file_path):
    """Assert that *file_path* contains no US-only English spellings.

    Words are ignored when they are: valid in en_GB, invalid in both
    dialects, CamelCase, all-uppercase, or proper nouns in context.
    Raises AssertionError on the first US-specific spelling found.
    """
    # Create dictionaries for US and UK English
    us = enchant.Dict("en_US")
    uk = enchant.Dict('en_GB')

    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip first line (the opening '---' of the frontmatter)
        f.seek(0)
        next(f)
        is_frontmatter = True
        in_html_block = False

        for line in f:
            # Skip if frontmatter
            if is_frontmatter:
                if line.startswith('---'):
                    is_frontmatter = False
                continue
            # Skip HTML
            # NOTE(review): in_html_block is never reset to False, so the
            # first line containing both '<' and '>' skips the rest of the
            # file — confirm this is intentional.
            if '<' in line and '>' in line:
                in_html_block = True

            if in_html_block:
                continue

            _line = line
            # Remove existing Markdown links and Liquid tags
            _line = re.sub(r'\[(.*?)\]\([^)]*?\)|\{\{[^}]*\}\}|{%[^\n]*%}', ' ... ', _line)

            for word in _line.split():
                # Extract only the letters (strip surrounding punctuation,
                # keeping apostrophes and hyphens inside the word)
                raw_word = re.sub(r'^[^a-zA-Z\'-]*|[^a-zA-Z\'-]*$', '', word)

                if raw_word:
                    # Skip words that have the same spelling in both US and UK English or are wrong for both
                    if ((us.check(raw_word) == uk.check(raw_word)) or (uk.check(raw_word) and not us.check(raw_word))):
                        continue
                    # Skip CamelCase and all-uppercase words
                    if is_camel_case(raw_word) or raw_word.isupper():
                        continue
                    # Remaining case: valid only in en_US — allowed only when
                    # it is a proper noun in this sentence (e.g. a brand name).
                    assert is_proper_noun_in_context(_line, raw_word), \
                        f'US-specific spelling: "{raw_word}" in {file_path} \n\n\t at the line: {line} \n\n\tsuggestions: {uk.suggest(raw_word)} \n'
|
||
|
||
# Directories excluded from the spell-check walk.
EXCLUDE_DIRS = ['vendor']
# File names excluded from the spell-check walk.
EXCLUDE_FILES = ['README.md', 'CHANGELOG.md']
# Root to scan, relative to the current working directory.
# NOTE(review): this assumes tests run from .github/tests — the CI workflow
# invokes `pytest .github/tests` from the repo root; verify which applies.
DIR = ('../../')
|
||
def test_main():
    """Spell-check every article discovered under the repository root."""
    paths = list(walk_directory(DIR, exclude_dirs=EXCLUDE_DIRS, exclude_files=EXCLUDE_FILES))
    for path in paths:
        check_spellings(path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import os | ||
from bs4 import BeautifulSoup | ||
|
||
def test_unique_slugs():
    """Every URL in the built sitemap must have a unique, one-level-deep slug."""

    # Locate the generated sitemap relative to this test file (./github/tests)
    here = os.path.dirname(os.path.abspath(__file__))
    sitemap_path = os.path.join(here, '..', '..', '_site', 'sitemap.xml')

    # Parse the locally built sitemap
    with open(sitemap_path, 'r', encoding='utf-8') as fh:
        soup = BeautifulSoup(fh.read(), 'xml')

    seen_slugs = []
    for loc in soup.find_all('loc'):
        url = loc.text
        slug = url.split('/')[-1]

        # Check duplicate/conflicting slugs
        assert slug not in seen_slugs, f'Duplicate Slug {slug} for the URL {url}: {slug} or unnecessary trailing slash'
        seen_slugs.append(slug)

        # The path component just above the slug (ignoring the 'files' prefix)
        parent = url.split('/')[-2]

        # Check that URL is one level deep
        if parent != 'files':
            assert parent == 'machinetranslate.org', f'{url}: Paths should be 1 level deep e.g URL/slug'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
name: CI

on:
  push:
  pull_request:

jobs:
  run-tests:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-versions:
          - '3.10'
          - '3.11'
    name: Test
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.2.0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-versions }}

      - name: Install Jekyll and Bundler
        run: |
          gem install jekyll bundler
          bundle install
        env:
          JEKYLL_ENV: production

      # Python deps plus the spaCy model used by the spelling tests
      - name: Install Python dependencies
        run: |
          pip install -r .github/tests/requirements.txt
          python -m spacy download en_core_web_sm

      - name: Build Jekyll site
        run: bundle exec jekyll build
        env:
          JEKYLL_ENV: development
          PAGES_REPO_NWO: ${{ github.repository }}

      - name: Check broken links to pages
        run: |
          bundle exec htmlproofer _site \
            --enforce-https false \
            --disable-external true \
            --ignore-missing-alt true \
            --allow-missing-href true \
            --check-internal-hash true
        env:
          JEKYLL_ENV: development

      # Background server so the sitemap tests can hit a live site
      - name: Serve Jekyll site
        run: bundle exec jekyll serve --detach
        env:
          JEKYLL_ENV: production
          PAGES_REPO_NWO: ${{ github.repository }}

      - name: Update pytest
        run: pip install --upgrade pytest pytest-asyncio

      - name: Run tests
        run: pytest .github/tests

      - name: Stop Jekyll server
        run: pkill -f 'bundle exec jekyll serve' || true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.