-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add test for checking 404 urls * Add test to check duplicate slugs * Fix Wikipedia link for languages we only have a code * Fix hyperlinks * Add CI tests * Add supported version of sass-embedded * Add sass-embedded plugin * Downgrade sass-embedded plugin version * Remove sass-embedded gem * Change ruby version to 3.2.0 * Change html-proofer version * Change Omniscien id * Update htmlproofer * Change Omniscien id * Update .gitignore * Run generate.py * Change tests order * Change file paths * Add GitHub Metadata * Add repo name * Serve Jekyll in the background * Update requirements * Remove sleep * Update requirements * Update requirements * Update requirements. * update requirements * update requirements * update requirements * update requirements * Add update pytest run * Fix broken links * Fix json syntax and translation hub id * Run generate.py * Change Google id * Run generate.py * Change validation * Add comments * Change request logic with local lookup * Add link to GitHub Metadata issue solution * Slice codes list * Fix bug * Change phrase-mtqe to phrase-tms * Correct API name * Modify requirements.txt * Change domain * Build jekyll before checking broken links * Remove comment * Enforce style * Add assertion error message * . * Add test to check for US-specific spellings * Update requirements * Update .gitignore * Add model installation * Remove enchant --------- Co-authored-by: Tovmas <tharrison748@gmail.com>
- Loading branch information
1 parent
35cf483
commit 88b97f5
Showing
82 changed files
with
518 additions
and
125 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
import os | ||
import re | ||
|
||
|
||
def walk_directory(directory, exclude_dirs=None, exclude_files=None): | ||
exclude_dirs = exclude_dirs or [] | ||
exclude_files = exclude_files or [] | ||
|
||
for root, dirs, files in os.walk(directory): | ||
if all(exclude_dir not in root for exclude_dir in exclude_dirs): | ||
dirs[:] = [d for d in dirs if not d.startswith(('.', '_'))] | ||
for file in files: | ||
if file.endswith(".md") and file not in exclude_files: | ||
yield os.path.join(root, file) | ||
|
||
def preprocess_article_names(file_paths):
    """Turn file paths into article names: basename, no extension, '-' -> ' '."""
    names = []
    for path in file_paths:
        stem, _ext = os.path.splitext(os.path.basename(path))
        names.append(stem.replace('-', ' '))
    return names
|
||
def create_unlinked_word_pattern(article_names):
    """Compile a case-insensitive whole-word regex matching any article name."""
    alternatives = '|'.join(re.escape(name) for name in article_names)
    return re.compile(r'\b(?:' + alternatives + r')\b', re.IGNORECASE)
|
||
def check_file(file_path, unlinked_word_pattern):
    """Interactively report unlinked mentions of other article names in one file.

    For each article name that appears in *file_path* without being wrapped
    in a Markdown link, prints a link suggestion and pauses for ENTER.
    Returns None. Files marked ``autogenerated: true`` are skipped entirely.
    """
    suggestions = set()  # (article_name, file_path) pairs already reported
    # NOTE(review): splits on '/' — not portable to Windows paths;
    # os.path.basename would be safer. Verify the intended platforms.
    file_name = file_path.split('/')[-1]

    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip the first line (the opening '---' of the frontmatter).
        f.seek(0)
        next(f)
        is_frontmatter = True
        in_html_block = False

        content = f.read()
        for line in content.splitlines():

            # Autogenerated files are maintained elsewhere — stop immediately.
            if line.strip() == "autogenerated: true":
                return

            # Skip frontmatter lines until the closing '---' is seen.
            if is_frontmatter:
                if line.startswith('---'):
                    is_frontmatter = False
                continue

            # Skip HTML.
            # NOTE(review): in_html_block is never reset to False, so the
            # first line containing both '<' and '>' silences the rest of
            # the file — confirm this is intentional.
            if '<' in line and '>' in line:
                in_html_block = True

            if in_html_block:
                continue

            _line = line.lower()

            # Remove existing Markdown links and Liquid tags so their text
            # cannot be re-suggested as an unlinked mention.
            _line = re.sub(r'\[(.*?)\]\([^)]*?\)|\{\{[^}]*\}\}|{%[^\n]*%}', ' ... ', _line)

            # Search for suggestions
            for match in unlinked_word_pattern.finditer(_line):
                article_name = match.group()
                suggestion_key = (article_name, file_path)
                # If it's a link to our own file, skip.
                if file_name.lower() == f'{article_name.replace(" ", "-")}.md':
                    continue
                # Skip the file if the word has already been linked
                # anywhere in it (singular or simple plural form).
                if f'[{article_name}](/' in content or f'[{article_name}s](/' in content:
                    break
                # If we already have this suggestion, skip.
                if suggestion_key in suggestions:
                    continue
                suggestions.add(suggestion_key)

                print(f'"{article_name}" in {file_path.split("/")[-1]}')
                print(f'\t at the line: {line}')
                print(f'\t can be linked: [{article_name}](/{article_name.replace(" ", "-")}) \n')
                print(30 * '-')
                input("Press ENTER for more...")  # Pause so each suggestion can be reviewed
                print(30 * '-')
|
||
|
||
|
||
# Directories excluded from the article scan.
EXCLUDE_DIRS = ['events', 'vendor']  # Events can be removed as soon as it becomes autogenerated
# File names excluded from the article scan.
EXCLUDE_FILES = ['README.md', 'CHANGELOG.md', 'events']
# Repository root — relative to this script's directory (.github/...);
# NOTE(review): this assumes the script is run from its own directory.
DIR = ('../..')
|
||
def main():
    """Collect all article files, then report unlinked mentions in each one."""
    article_paths = list(walk_directory(DIR, exclude_dirs=EXCLUDE_DIRS, exclude_files=EXCLUDE_FILES))
    pattern = create_unlinked_word_pattern(preprocess_article_names(article_paths))
    for path in article_paths:
        check_file(path, pattern)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
beautifulsoup4==4.11.1 | ||
bs4==0.0.1 | ||
charset-normalizer==2.1.1 | ||
html5lib==1.1 | ||
lxml==4.9.3 | ||
pluggy==1.3.0 | ||
pyenchant==3.2.2 | ||
pytest==7.4.3 | ||
pytest-asyncio==0.23.1 | ||
pytest-pyodide==0.55.1 | ||
python-dateutil==2.8.2 | ||
PyYAML==6.0.1 | ||
requests==2.31.0 | ||
six==1.16.0 | ||
soupsieve==2.5 | ||
typing_extensions==4.8.0 | ||
Unidecode==1.3.7 | ||
urllib3==2.1.0 | ||
zipp==1.0.0 | ||
spacy==3.7.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import concurrent.futures | ||
import requests | ||
from bs4 import BeautifulSoup | ||
import pytest | ||
|
||
|
||
class TestSitemapRequests:
    """Check that every URL listed in the production sitemap returns HTTP 200."""

    # Timeout (seconds) applied to every HTTP request.
    TIMEOUT = 20

    # Fixture to create a session for each test method
    @pytest.fixture
    def session(self):
        with requests.Session() as session:
            yield session

    # Helper function to fetch a URL with retries.
    # Returns (url, status_code); status 0 means every attempt failed.
    def fetch_url(self, session, url, timeout=TIMEOUT):
        max_retries = 3
        for attempt in range(max_retries):
            try:
                # Bug fix: previously no timeout was passed here (the test's
                # timeout only covered the sitemap fetch), so a single hung
                # URL could stall its worker thread indefinitely.
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
                return url, response.status_code
            except requests.exceptions.RequestException as e:
                if attempt < max_retries - 1:
                    print(f"Retrying {url} (Attempt {attempt + 1})")
                else:
                    print(f"Error for URL {url}: {e}")
                    return url, 0

    # Test method for sitemap requests
    def test_sitemap_requests(self, session):
        base_url = 'https://machinetranslate.org'
        sitemap_url = base_url + '/sitemap'

        # Fetch the sitemap
        response = session.get(sitemap_url, timeout=self.TIMEOUT)
        assert response.status_code == 200

        # Parse the sitemap using BeautifulSoup
        soup = BeautifulSoup(response.text, 'xml')
        url_elements = soup.find_all('loc')

        # Extract URLs from the sitemap
        urls = [url_element.text for url_element in url_elements]

        # Fetch all URLs concurrently
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.fetch_url, session, url) for url in urls]

            # Collect URLs with non-200 status codes
            not_found_urls = []
            for future in concurrent.futures.as_completed(futures):
                url, status_code = future.result()
                if status_code != 200:
                    not_found_urls.append(url)

        # Assert that there are no URLs with non-200 status codes
        assert len(not_found_urls) == 0, f'Not found URLs: {not_found_urls}'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import re | ||
|
||
import enchant | ||
import spacy | ||
|
||
from .find_missing_links import walk_directory | ||
|
||
|
||
# Load the small English spaCy pipeline once at import time
# (requires `python -m spacy download en_core_web_sm` to have been run).
NLP = spacy.load("en_core_web_sm")
|
||
def is_proper_noun_in_context(line, word):
    """Return True if *word* occurs in *line* tagged as a proper noun (PROPN)."""
    return any(
        token.pos_ == 'PROPN'
        for token in NLP(line)
        if token.text == word
    )
|
||
def is_camel_case(word):
    """Return True if any character after the first is uppercase (e.g. 'iPhone')."""
    for ch in word[1:]:
        if ch.isupper():
            return True
    return False
|
||
def check_spellings(file_path):
    """Assert that *file_path* contains no US-only English spellings.

    Words are ignored when they are: valid in en_GB, invalid in both
    dialects, CamelCase, all-uppercase, or proper nouns in context.
    Raises AssertionError on the first US-specific spelling found.
    """
    # Create dictionaries for US and UK English
    us = enchant.Dict("en_US")
    uk = enchant.Dict('en_GB')

    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip first line (the opening '---' of the frontmatter)
        f.seek(0)
        next(f)
        is_frontmatter = True
        in_html_block = False

        for line in f:
            # Skip if frontmatter
            if is_frontmatter:
                if line.startswith('---'):
                    is_frontmatter = False
                continue
            # Skip HTML
            # NOTE(review): in_html_block is never reset to False, so the
            # first line containing both '<' and '>' skips the rest of the
            # file — confirm this is intentional.
            if '<' in line and '>' in line:
                in_html_block = True

            if in_html_block:
                continue

            _line = line
            # Remove existing Markdown links and Liquid tags
            _line = re.sub(r'\[(.*?)\]\([^)]*?\)|\{\{[^}]*\}\}|{%[^\n]*%}', ' ... ', _line)

            for word in _line.split():
                # Extract only the letters (strip surrounding punctuation,
                # keeping apostrophes and hyphens inside the word)
                raw_word = re.sub(r'^[^a-zA-Z\'-]*|[^a-zA-Z\'-]*$', '', word)

                if raw_word:
                    # Skip words that have the same spelling in both US and UK English or are wrong for both
                    if ((us.check(raw_word) == uk.check(raw_word)) or (uk.check(raw_word) and not us.check(raw_word))):
                        continue
                    # Skip CamelCase and all-uppercase words
                    if is_camel_case(raw_word) or raw_word.isupper():
                        continue
                    # Remaining case: valid only in en_US — allowed only when
                    # it is a proper noun in this sentence (e.g. a brand name).
                    assert is_proper_noun_in_context(_line, raw_word), \
                        f'US-specific spelling: "{raw_word}" in {file_path} \n\n\t at the line: {line} \n\n\tsuggestions: {uk.suggest(raw_word)} \n'
|
||
|
||
# Directories excluded from the spell-check walk.
EXCLUDE_DIRS = ['vendor']
# File names excluded from the spell-check walk.
EXCLUDE_FILES = ['README.md', 'CHANGELOG.md']
# Root to scan, relative to the current working directory.
# NOTE(review): this assumes tests run from .github/tests — the CI workflow
# invokes `pytest .github/tests` from the repo root; verify which applies.
DIR = ('../../')
|
||
def test_main():
    """Spell-check every article discovered under the repository root."""
    paths = list(walk_directory(DIR, exclude_dirs=EXCLUDE_DIRS, exclude_files=EXCLUDE_FILES))
    for path in paths:
        check_spellings(path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import os | ||
from bs4 import BeautifulSoup | ||
|
||
def test_unique_slugs():
    """Every URL in the built sitemap must have a unique, one-level-deep slug."""

    # Locate the generated sitemap relative to this test file (./github/tests)
    here = os.path.dirname(os.path.abspath(__file__))
    sitemap_path = os.path.join(here, '..', '..', '_site', 'sitemap.xml')

    # Parse the locally built sitemap
    with open(sitemap_path, 'r', encoding='utf-8') as fh:
        soup = BeautifulSoup(fh.read(), 'xml')

    seen_slugs = []
    for loc in soup.find_all('loc'):
        url = loc.text
        slug = url.split('/')[-1]

        # Check duplicate/conflicting slugs
        assert slug not in seen_slugs, f'Duplicate Slug {slug} for the URL {url}: {slug} or unnecessary trailing slash'
        seen_slugs.append(slug)

        # The path component just above the slug (ignoring the 'files' prefix)
        parent = url.split('/')[-2]

        # Check that URL is one level deep
        if parent != 'files':
            assert parent == 'machinetranslate.org', f'{url}: Paths should be 1 level deep e.g URL/slug'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
name: CI

on:
  push:
  pull_request:

jobs:
  run-tests:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-versions:
          - '3.10'
          - '3.11'
    name: Test
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.2.0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-versions }}

      - name: Install Jekyll and Bundler
        run: |
          gem install jekyll bundler
          bundle install
        env:
          JEKYLL_ENV: production

      # Python deps plus the spaCy model used by the spelling tests
      - name: Install Python dependencies
        run: |
          pip install -r .github/tests/requirements.txt
          python -m spacy download en_core_web_sm

      - name: Build Jekyll site
        run: bundle exec jekyll build
        env:
          JEKYLL_ENV: development
          PAGES_REPO_NWO: ${{ github.repository }}

      - name: Check broken links to pages
        run: |
          bundle exec htmlproofer _site \
            --enforce-https false \
            --disable-external true \
            --ignore-missing-alt true \
            --allow-missing-href true \
            --check-internal-hash true
        env:
          JEKYLL_ENV: development

      # Background server so the sitemap tests can hit a live site
      - name: Serve Jekyll site
        run: bundle exec jekyll serve --detach
        env:
          JEKYLL_ENV: production
          PAGES_REPO_NWO: ${{ github.repository }}

      - name: Update pytest
        run: pip install --upgrade pytest pytest-asyncio

      - name: Run tests
        run: pytest .github/tests

      - name: Stop Jekyll server
        run: pkill -f 'bundle exec jekyll serve' || true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.