Skip to content

Commit

Permalink
Add CI (#582)
Browse files Browse the repository at this point in the history
* Add test for checking 404 urls

* Add test to check duplicate slugs

* Fix Wikipedia link for languages we only have a code

* Fix hyperlinks

* Add CI tests

* Add supported version of sass-embedded

* Add sass-embedded plugin

* Downgrade sass-embedded plugin version

* Remove sass-embedded gem

* Change ruby version to 3.2.0

* Change html-proofer version

* Change Omniscien id

* Update htmlproofer

* Change Omniscien id

* Update .gitignore

* Run generate.py

* Change tests order

* Change file paths

* Add GitHub Metadata

* Add repo name

* Serve Jekyll in the background

* Update requirements

* Remove sleep

* Update requirements

* Update requirements

* Update requirements.

* update requirements

* update requirements

* update requirements

* update requirements

* Add update pytest run

* Fix broken links

* Fix json syntax and translation hub id

* Run generate.py

* Change Google id

* Run generate.py

* Change validation

* Add comments

* Change request logic with local lookup

* Add link to GitHub Metadata issue solution

* Slice codes list

* Fix bug

* Change phrase-mtqe to phrase-tms

* Correct API name

* Modify requirements.txt

* Change domain

* Build jekyll before checking broken links

* Remove comment

* Enforce style

* Add assertion error message

* .

* Add test to check for US-specific spellings

* Update requirements

* Update .gitignore

* Add model installation

* Remove enchant

---------

Co-authored-by: Tovmas <tharrison748@gmail.com>
  • Loading branch information
tovmasharrison and Tovmas authored Jan 4, 2024
1 parent 35cf483 commit 88b97f5
Show file tree
Hide file tree
Showing 82 changed files with 518 additions and 125 deletions.
Empty file added .github/tests/__init__.py
Empty file.
95 changes: 95 additions & 0 deletions .github/tests/find_missing_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import os
import re


def walk_directory(directory, exclude_dirs=None, exclude_files=None):
    """Yield paths of Markdown files under *directory*.

    Directories whose names start with '.' or '_' are skipped, as is any
    root whose path contains one of *exclude_dirs* (substring match, kept
    for backward compatibility).  Files listed in *exclude_files* and
    non-``.md`` files are ignored.
    """
    exclude_dirs = exclude_dirs or []
    exclude_files = exclude_files or []

    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into hidden/underscore
        # or excluded directories.  The original only filtered yielded
        # files, which meant entire excluded trees (e.g. vendor/) were
        # still walked; the set of yielded files is unchanged.
        dirs[:] = [
            d for d in dirs
            if not d.startswith(('.', '_')) and d not in exclude_dirs
        ]
        # Keep the original substring check so paths that merely contain
        # an excluded name are still skipped.
        if any(exclude_dir in root for exclude_dir in exclude_dirs):
            continue
        for file in files:
            if file.endswith(".md") and file not in exclude_files:
                yield os.path.join(root, file)

def preprocess_article_names(file_paths):
    """Turn Markdown file paths into article names (basename, '-' -> ' ')."""
    names = []
    for path in file_paths:
        stem, _extension = os.path.splitext(os.path.basename(path))
        names.append(stem.replace('-', ' '))
    return names

def create_unlinked_word_pattern(article_names):
    """Build a case-insensitive whole-word regex matching any article name."""
    alternation = '|'.join(re.escape(name) for name in article_names)
    return re.compile(r'\b(?:' + alternation + r')\b', re.IGNORECASE)

def check_file(file_path, unlinked_word_pattern):
    """Interactively report words in a Markdown file that could be linked.

    Scans *file_path* for occurrences of other article names (matched by
    *unlinked_word_pattern*) that are not already Markdown links, prints a
    link suggestion for each, and pauses for ENTER between suggestions.
    Files whose frontmatter contains ``autogenerated: true`` are skipped
    entirely.
    """
    suggestions = set()
    file_name = file_path.split('/')[-1]

    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip the first line (presumably the opening '---' of the
        # frontmatter — TODO confirm); `content` holds everything after it.
        f.seek(0)
        next(f)
        is_frontmatter = True
        in_html_block = False

        content = f.read()
        for line in content.splitlines():

            # Return if autogenerated
            if line.strip() == "autogenerated: true":
                return

            # Skip if frontmatter (the closing '---' flips the flag and is
            # itself skipped)
            if is_frontmatter:
                if line.startswith('---'):
                    is_frontmatter = False
                continue

            # Skip HTML
            # NOTE(review): in_html_block is set here but never reset to
            # False, so the first line containing both '<' and '>'
            # suppresses checking for the remainder of the file — confirm
            # this is intended.
            if '<' in line and '>' in line:
                in_html_block = True

            if in_html_block:
                continue

            # Lowercased working copy; the pattern is case-insensitive but
            # the link checks below compare lowercase text.
            _line = line.lower()

            # Remove existing Markdown links and Liquid tags
            _line = re.sub(r'\[(.*?)\]\([^)]*?\)|\{\{[^}]*\}\}|{%[^\n]*%}', ' ... ', _line)

            # Search for suggestions
            for match in unlinked_word_pattern.finditer(_line):
                article_name = match.group()
                suggestion_key = (article_name, file_path)
                # If it's a link to our own file, skip.
                if file_name.lower() == f'{article_name.replace(" ", "-")}.md':
                    continue
                # Word already linked (singular or plural) somewhere in the
                # file — NOTE(review): this `break` only skips the rest of
                # the current line's matches, not the whole file as the
                # original comment suggested.
                if f'[{article_name}](/' in content or f'[{article_name}s](/' in content:
                    break
                # If we already have this suggestion, skip.
                if suggestion_key in suggestions:
                    continue
                suggestions.add(suggestion_key)

                print(f'"{article_name}" in {file_path.split("/")[-1]}')
                print(f'\t at the line: {line}')
                print(f'\t can be linked: [{article_name}](/{article_name.replace(" ", "-")}) \n')
                print(30 * '-')
                input("Press ENTER for more...")  # Pause so each suggestion can be reviewed
                print(30 * '-')



# Events can be removed as soon as it becomes autogenerated
EXCLUDE_DIRS = ['events', 'vendor']
EXCLUDE_FILES = ['README.md', 'CHANGELOG.md', 'events']
DIR = '../..'


def main():
    """Run the unlinked-word check over every article in the repository."""
    paths = list(walk_directory(DIR, exclude_dirs=EXCLUDE_DIRS, exclude_files=EXCLUDE_FILES))
    pattern = create_unlinked_word_pattern(preprocess_article_names(paths))

    for path in paths:
        check_file(path, pattern)


if __name__ == "__main__":
    main()
20 changes: 20 additions & 0 deletions .github/tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
beautifulsoup4==4.11.1
bs4==0.0.1
charset-normalizer==2.1.1
html5lib==1.1
lxml==4.9.3
pluggy==1.3.0
pyenchant==3.2.2
pytest==7.4.3
pytest-asyncio==0.23.1
pytest-pyodide==0.55.1
python-dateutil==2.8.2
PyYAML==6.0.1
requests==2.31.0
six==1.16.0
soupsieve==2.5
typing_extensions==4.8.0
Unidecode==1.3.7
urllib3==2.1.0
zipp==1.0.0
spacy==3.7.2
61 changes: 61 additions & 0 deletions .github/tests/test_check_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import pytest


class TestSitemapRequests:
    """ Class for checking 404 pages """

    # Fixture to create a session for each test method
    @pytest.fixture
    def session(self):
        with requests.Session() as session:
            yield session

    # Helper function to fetch a URL with retries.
    # `timeout` (seconds) is new and defaults to the value the test method
    # already used for the sitemap fetch, so existing callers are unaffected.
    def fetch_url(self, session, url, timeout=20):
        """Fetch *url* with up to 3 attempts.

        Returns ``(url, status_code)``; a status of 0 signals that every
        attempt failed.
        """
        max_retries = 3
        for attempt in range(max_retries):
            try:
                # Bug fix: the original call had no timeout, so a single
                # unresponsive URL could hang the whole CI run.
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
                return url, response.status_code
            except requests.exceptions.RequestException as e:
                if attempt < max_retries - 1:
                    print(f"Retrying {url} (Attempt {attempt + 1})")
                else:
                    print(f"Error for URL {url}: {e}")
                    return url, 0

    # Test method for sitemap requests
    def test_sitemap_requests(self, session):
        base_url = 'https://machinetranslate.org'
        sitemap_url = base_url + '/sitemap'

        # Timeout for HTTP requests
        timeout = 20

        # Fetch the sitemap
        response = session.get(sitemap_url, timeout=timeout)
        assert response.status_code == 200

        # Parse the sitemap using BeautifulSoup
        soup = BeautifulSoup(response.text, 'xml')
        url_elements = soup.find_all('loc')

        # Extract URLs from the sitemap
        urls = [url_element.text for url_element in url_elements]

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.fetch_url, session, url, timeout) for url in urls]

        # Collect URLs with non-200 status codes
        not_found_urls = []
        for future in concurrent.futures.as_completed(futures):
            url, status_code = future.result()
            if status_code != 200:
                not_found_urls.append(url)

        # Assert that there are no URLs with non-200 status codes
        assert len(not_found_urls) == 0, f'Not found URLs: {not_found_urls}'
75 changes: 75 additions & 0 deletions .github/tests/test_enforce_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import re

import enchant
import spacy

from .find_missing_links import walk_directory


# Load the English language model for Spacy
NLP = spacy.load("en_core_web_sm")


def is_proper_noun_in_context(line, word):
    """Return True when *word* is tagged as a proper noun (PROPN) in *line*."""
    return any(
        token.text == word and token.pos_ == 'PROPN'
        for token in NLP(line)
    )

def is_camel_case(word):
    """True if any character after the first is uppercase (e.g. 'iPhone')."""
    tail = word[1:]
    return any(map(str.isupper, tail))

def check_spellings(file_path):
    """Assert that no word in *file_path* uses a US-only spelling.

    A word fails only when it is valid US English but invalid UK English,
    is not CamelCase or all-uppercase, and is not tagged as a proper noun
    in its line.  Raises AssertionError (pytest-style) on the first
    US-specific spelling found.
    """
    # Create dictionaries for US and UK English
    us = enchant.Dict("en_US")
    uk = enchant.Dict('en_GB')

    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip first line (presumably the opening '---' of the
        # frontmatter — TODO confirm)
        f.seek(0)
        next(f)
        is_frontmatter = True
        in_html_block = False

        for line in f:
            # Skip if frontmatter (the closing '---' flips the flag and is
            # itself skipped)
            if is_frontmatter:
                if line.startswith('---'):
                    is_frontmatter = False
                continue
            # Skip HTML
            # NOTE(review): in_html_block is set here but never reset to
            # False, so the first line containing both '<' and '>'
            # suppresses checking for the rest of the file — confirm this
            # is intended (same pattern as check_file in
            # find_missing_links.py).
            if '<' in line and '>' in line:
                in_html_block = True

            if in_html_block:
                continue

            _line = line
            # Remove existing Markdown links and Liquid tags
            _line = re.sub(r'\[(.*?)\]\([^)]*?\)|\{\{[^}]*\}\}|{%[^\n]*%}', ' ... ', _line)

            for word in _line.split():
                # Extract only the letters (strip leading/trailing
                # punctuation while keeping apostrophes and hyphens)
                raw_word = re.sub(r'^[^a-zA-Z\'-]*|[^a-zA-Z\'-]*$', '', word)

                if raw_word:
                    # Skip words that have the same spelling in both US and UK English or are wrong for both
                    if ((us.check(raw_word) == uk.check(raw_word)) or (uk.check(raw_word) and not us.check(raw_word))):
                        continue
                    # Skip CamelCase and all-uppercase words
                    if is_camel_case(raw_word) or raw_word.isupper():
                        continue
                    # Check if the word is a proper noun in context
                    assert is_proper_noun_in_context(_line, raw_word), \
                        f'US-specific spelling: "{raw_word}" in {file_path} \n\n\t at the line: {line} \n\n\tsuggestions: {uk.suggest(raw_word)} \n'


EXCLUDE_DIRS = ['vendor']
EXCLUDE_FILES = ['README.md', 'CHANGELOG.md']
DIR = '../../'


def test_main():
    """Spell-check every Markdown article for US-specific spellings."""
    paths = list(walk_directory(DIR, exclude_dirs=EXCLUDE_DIRS, exclude_files=EXCLUDE_FILES))
    for path in paths:
        check_spellings(path)
32 changes: 32 additions & 0 deletions .github/tests/test_slugs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
from bs4 import BeautifulSoup

def test_unique_slugs():
    """Check the built sitemap for duplicate slugs and over-deep URLs."""

    # Directory of this script (./.github/tests)
    here = os.path.dirname(os.path.abspath(__file__))

    # The sitemap.xml generated by the Jekyll build in _site/
    sitemap_path = os.path.join(here, '..', '..', '_site', 'sitemap.xml')

    # Parse the local sitemap
    with open(sitemap_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'xml')

    seen_slugs = []
    for url_element in soup.find_all('loc'):
        parts = url_element.text.split('/')
        slug = parts[-1]

        # Check duplicate/conflicting slugs
        assert slug not in seen_slugs, f'Duplicate Slug {slug} for the URL {url_element.text}: {slug} or unnecessary trailing slash'
        seen_slugs.append(slug)

        # Parent path segment (ignoring the 'files' prefix)
        parent = parts[-2]

        # Check that URL is one level deep
        if parent != 'files':
            assert parent == 'machinetranslate.org', f'{url_element.text}: Paths should be 1 level deep e.g URL/slug'
73 changes: 73 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Continuous integration: build the Jekyll site, check internal links with
# html-proofer, serve the site locally, then run the Python test suite.
name: CI

on:
  push:
  pull_request:

jobs:
  run-tests:
    strategy:
      # Keep other matrix jobs running when one Python version fails.
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        # NOTE: custom matrix key (plural); consumed below as
        # ${{ matrix.python-versions }}.
        python-versions:
          - '3.10'
          - '3.11'
    name: Test
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.2.0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-versions }}
      - name: Install Jekyll and Bundler
        run: |
          gem install jekyll bundler
          bundle install
        env:
          JEKYLL_ENV: production

      - name: Install Python dependencies
        run: |
          pip install -r .github/tests/requirements.txt
          python -m spacy download en_core_web_sm
      - name: Build Jekyll site
        run: bundle exec jekyll build
        env:
          JEKYLL_ENV: development
          # Required by jekyll-github-metadata to resolve site.github values.
          PAGES_REPO_NWO: ${{ github.repository }}

      # External links are skipped here; the Python test suite checks the
      # live sitemap instead.
      - name: Check broken links to pages
        run: |
          bundle exec htmlproofer _site \
            --enforce-https false \
            --disable-external true \
            --ignore-missing-alt true \
            --allow-missing-href true \
            --check-internal-hash true
        env:
          JEKYLL_ENV: development

      # --detach returns immediately so the later test steps can run
      # against the locally served site.
      - name: Serve Jekyll site
        run: bundle exec jekyll serve --detach
        env:
          JEKYLL_ENV: production
          PAGES_REPO_NWO: ${{ github.repository }}

      - name: Update pytest
        run: pip install --upgrade pytest pytest-asyncio

      - name: Run tests
        run: pytest .github/tests

      # Best-effort cleanup; '|| true' keeps the job green if the server
      # already exited.
      - name: Stop Jekyll server
        run: pkill -f 'bundle exec jekyll serve' || true
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@ _site
.jekyll-cache
.jekyll-metadata
vendor
.ruby-version
.ruby-version
/venv
.pytest_cache
__pycache__
.github/tests/.pytest_cache
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ group :jekyll_plugins do
gem 'jekyll-redirect-from'
# gem 'jekyll-target-blank'
gem 'jekyll-seo-tag'
gem 'html-proofer'
gem 'html-proofer', '~> 4.3.0'
gem 'jekyll-include-cache'
end
gem 'webrick'
Expand Down
Loading

0 comments on commit 88b97f5

Please sign in to comment.