From 88b97f55094723e1f9ad19a758c3af16e40184b1 Mon Sep 17 00:00:00 2001 From: tovmasharrison <94985882+tovmasharrison@users.noreply.github.com> Date: Fri, 5 Jan 2024 02:01:31 +0400 Subject: [PATCH] Add CI (#582) * Add test for checking 404 urls * Add test to check duplicate slugs * Fix Wikipedia link for languages we only have a code * Fix hyperlinks * Add CI tests * Add supported version of sass-embedded * Add sass-embedded plugin * Downgrade sass-embedded plugin version * Remove sass-embedded gem * Change ruby version to 3.2.0 * Change html-proofer version * Change Omniscien id * Update htmlproofer * Change Omniscien id * Update .gitignore * Run generate.py * Change tests order * Change file paths * Add GitHub Metadata * Add repo name * Serve Jekyll in the background * Update requirements * Remove sleep * Update requirements * Update requirements * Update requirements. * update requirements * update requirements * update requirements * update requirements * Add update pytest run * Fix broken links * Fix json syntax and translation hub id * Run generate.py * Change Google id * Run generate.py * Change validation * Add comments * Change request logic with local lookup * Add link to GitHub Metadata issue solution * Slice codes list * Fix bug * Change phrase-mtqe to phrase-tms * Correct API name * Modify requirements.txt * Change domain * Build jekyll before checking broken links * Remove comment * Enforce style * Add assertion error message * . 
* Add test to check for US-specific spellings * Update requirements * Update .gitignore * Add model installation * Remove enchant --------- Co-authored-by: Tovmas --- .github/tests/__init__.py | 0 .github/tests/find_missing_links.py | 95 +++++++++++++++++++ .github/tests/requirements.txt | 20 ++++ .github/tests/test_check_urls.py | 61 ++++++++++++ .github/tests/test_enforce_style.py | 75 +++++++++++++++ .github/tests/test_slugs.py | 32 +++++++ .github/workflows/ci.yaml | 73 ++++++++++++++ .gitignore | 6 +- Gemfile | 2 +- Gemfile.lock | 28 +++--- _config.yml | 4 + _data/quality_estimation.json | 10 +- _layouts/language.html | 2 +- applications/live-chat.md | 6 +- building-and-research/metrics/metrics.md | 4 +- concepts/context.md | 6 ++ concepts/sentence-embeddings.md | 7 ++ concepts/tokenisation.md | 2 +- contributing/roadmap.md | 18 ++-- contributing/style.md | 2 +- events/aamt-seminar-1.md | 2 +- events/aamt2019.md | 2 +- events/ai-and-language-technologies.md | 2 +- events/ai-opportunities-and-risk.md | 2 +- events/events.md | 2 +- events/wmt09.md | 2 +- integrations/translation-hub.md | 5 +- languages/albanian.md | 2 +- languages/alemannic.md | 2 +- languages/arabic.md | 4 +- languages/bengali.md | 2 +- languages/bulgarian.md | 2 +- languages/burmese.md | 2 +- languages/chinese.md | 4 +- languages/czech.md | 2 +- languages/danish.md | 2 +- languages/dutch.md | 2 +- languages/english.md | 4 +- languages/estonian.md | 2 +- languages/finnish.md | 2 +- languages/french.md | 4 +- languages/german.md | 4 +- languages/greek.md | 2 +- languages/gujarati.md | 2 +- languages/hebrew.md | 2 +- languages/hindi.md | 4 +- languages/hungarian.md | 2 +- languages/indonesian.md | 2 +- languages/irish.md | 2 +- languages/italian.md | 4 +- languages/japanese.md | 4 +- languages/khmer.md | 2 +- languages/korean.md | 2 +- languages/latvian.md | 2 +- languages/lithuanian.md | 2 +- languages/malay.md | 2 +- languages/maltese.md | 2 +- languages/nepali.md | 2 +- languages/norwegian.md 
| 2 +- languages/persian.md | 2 +- languages/polish.md | 2 +- languages/portuguese.md | 4 +- languages/romanian.md | 2 +- languages/russian.md | 12 +-- languages/slovak.md | 2 +- languages/slovenian.md | 2 +- languages/spanish.md | 4 +- languages/swahili.md | 2 +- languages/swedish.md | 2 +- languages/tagalog.md | 2 +- languages/thai.md | 2 +- languages/turkish.md | 15 +-- languages/urdu.md | 2 +- languages/vietnamese.md | 2 +- more/associations/iamt.md | 2 +- more/associations/mt-summit.md | 7 ++ more/people/alon-lavie.md | 2 +- .../google-translation-hub-mtqp.md | 11 ++- .../omniscien-confidence-scores.md | 2 +- quality-estimation/quality-estimation.md | 6 +- resources/education.md | 4 +- resources/reports.md | 2 +- 82 files changed, 518 insertions(+), 125 deletions(-) create mode 100644 .github/tests/__init__.py create mode 100644 .github/tests/find_missing_links.py create mode 100644 .github/tests/requirements.txt create mode 100644 .github/tests/test_check_urls.py create mode 100644 .github/tests/test_enforce_style.py create mode 100644 .github/tests/test_slugs.py create mode 100644 .github/workflows/ci.yaml create mode 100644 concepts/context.md create mode 100644 concepts/sentence-embeddings.md create mode 100644 more/associations/mt-summit.md diff --git a/.github/tests/__init__.py b/.github/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.github/tests/find_missing_links.py b/.github/tests/find_missing_links.py new file mode 100644 index 000000000..ee4656194 --- /dev/null +++ b/.github/tests/find_missing_links.py @@ -0,0 +1,95 @@ +import os +import re + + +def walk_directory(directory, exclude_dirs=None, exclude_files=None): + exclude_dirs = exclude_dirs or [] + exclude_files = exclude_files or [] + + for root, dirs, files in os.walk(directory): + if all(exclude_dir not in root for exclude_dir in exclude_dirs): + dirs[:] = [d for d in dirs if not d.startswith(('.', '_'))] + for file in files: + if file.endswith(".md") and file 
not in exclude_files: + yield os.path.join(root, file) + +def preprocess_article_names(file_paths): + return [os.path.splitext(os.path.basename(file_path))[0].replace('-', ' ') for file_path in file_paths] + +def create_unlinked_word_pattern(article_names): + return re.compile(r'\b(?:' + '|'.join(map(re.escape, article_names)) + r')\b', re.IGNORECASE) + +def check_file(file_path, unlinked_word_pattern): + suggestions = set() + file_name = file_path.split('/')[-1] + + with open(file_path, 'r', encoding='utf-8') as f: + f.seek(0) + next(f) + is_frontmatter = True + in_html_block = False + + content = f.read() + for line in content.splitlines(): + + # Return if autogenerated + if line.strip() == "autogenerated: true": + return + + # Skip if frontmatter + if is_frontmatter: + if line.startswith('---'): + is_frontmatter = False + continue + + # Skip HTML + if '<' in line and '>' in line: + in_html_block = True + + if in_html_block: + continue + + _line = line.lower() + + # Remove existing Markdown links and Liquid tags + _line = re.sub(r'\[(.*?)\]\([^)]*?\)|\{\{[^}]*\}\}|{%[^\n]*%}', ' ... ', _line) + + # Search for suggestions + for match in unlinked_word_pattern.finditer(_line): + article_name = match.group() + suggestion_key = (article_name, file_path) + # If it's a link to our own file, skip. + if file_name.lower() == f'{article_name.replace(" ", "-")}.md': + continue + # Skip the file if the word has already been linked + if f'[{article_name}](/' in content or f'[{article_name}s](/' in content: + break + # If we already have this suggestion, skip. 
+ if suggestion_key in suggestions: + continue + suggestions.add(suggestion_key) + + print(f'"{article_name}" in {file_path.split("/")[-1]}') + print(f'\t at the line: {line}') + print(f'\t can be linked: [{article_name}](/{article_name.replace(" ", "-")}) \n') + print(30 * '-') + input("Press ENTER for more...") # Input is used to view each + print(30 * '-') + + + +EXCLUDE_DIRS = ['events', 'vendor'] # Events can be removed as soon as it becomes autogenerated +EXCLUDE_FILES = ['README.md', 'CHANGELOG.md', 'events'] +DIR = ('../..') + +def main(): + + article_paths = list(walk_directory(DIR, exclude_dirs=EXCLUDE_DIRS, exclude_files=EXCLUDE_FILES)) + article_names = preprocess_article_names(article_paths) + unlinked_word_pattern = create_unlinked_word_pattern(article_names) + + for article_path in article_paths: + check_file(article_path, unlinked_word_pattern) + +if __name__ == "__main__": + main() diff --git a/.github/tests/requirements.txt b/.github/tests/requirements.txt new file mode 100644 index 000000000..6336afeaa --- /dev/null +++ b/.github/tests/requirements.txt @@ -0,0 +1,20 @@ +beautifulsoup4==4.11.1 +bs4==0.0.1 +charset-normalizer==2.1.1 +html5lib==1.1 +lxml==4.9.3 +pluggy==1.3.0 +pyenchant==3.2.2 +pytest==7.4.3 +pytest-asyncio==0.23.1 +pytest-pyodide==0.55.1 +python-dateutil==2.8.2 +PyYAML==6.0.1 +requests==2.31.0 +six==1.16.0 +soupsieve==2.5 +typing_extensions==4.8.0 +Unidecode==1.3.7 +urllib3==2.1.0 +zipp==1.0.0 +spacy==3.7.2 diff --git a/.github/tests/test_check_urls.py b/.github/tests/test_check_urls.py new file mode 100644 index 000000000..312ed8532 --- /dev/null +++ b/.github/tests/test_check_urls.py @@ -0,0 +1,61 @@ +import concurrent.futures +import requests +from bs4 import BeautifulSoup +import pytest + + +class TestSitemapRequests: + """ Class for checking 404 pages """ + + # Fixture to create a session for each test method + @pytest.fixture + def session(self): + with requests.Session() as session: + yield session + + # Helper function to 
fetch a URL with retries + def fetch_url(self, session, url): + max_retries = 3 + for attempt in range(max_retries): + try: + response = session.get(url) + response.raise_for_status() + return url, response.status_code + except requests.exceptions.RequestException as e: + if attempt < max_retries - 1: + print(f"Retrying {url} (Attempt {attempt + 1})") + else: + print(f"Error for URL {url}: {e}") + return url, 0 + + # Test method for sitemap requests + def test_sitemap_requests(self, session): + base_url = 'https://machinetranslate.org' + sitemap_url = base_url + '/sitemap' + + # Timeout for HTTP requests + timeout = 20 + + # Fetch the sitemap + response = session.get(sitemap_url, timeout=timeout) + assert response.status_code == 200 + + # Parse the sitemap using BeautifulSoup + soup = BeautifulSoup(response.text, 'xml') + url_elements = soup.find_all('loc') + + # Extract URLs from the sitemap + urls = [url_element.text for url_element in url_elements] + + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(self.fetch_url, session, url) for url in urls] + + # Collect URLs with non-200 status codes + not_found_urls = [] + for future in concurrent.futures.as_completed(futures): + url, status_code = future.result() + if status_code != 200: + not_found_urls.append(url) + + # Assert that there are no URLs with non-200 status codes + assert len(not_found_urls) == 0, f'Not found URLs: {not_found_urls}' diff --git a/.github/tests/test_enforce_style.py b/.github/tests/test_enforce_style.py new file mode 100644 index 000000000..ac7a03873 --- /dev/null +++ b/.github/tests/test_enforce_style.py @@ -0,0 +1,75 @@ +import re + +import enchant +import spacy + +from .find_missing_links import walk_directory + + +# Load the English language model for Spacy +NLP = spacy.load("en_core_web_sm") + +# Check if a token in a Spacy document is a proper noun +def is_proper_noun_in_context(line, word): + doc = NLP(line) + for token in doc: + if token.text == 
word and token.pos_ == 'PROPN': + return True + return False + +def is_camel_case(word): + return any(c.isupper() for c in word[1:]) + +def check_spellings(file_path): + # Create dictionaries for US and UK English + us = enchant.Dict("en_US") + uk = enchant.Dict('en_GB') + + with open(file_path, 'r', encoding='utf-8') as f: + # Skip first line + f.seek(0) + next(f) + is_frontmatter = True + in_html_block = False + + for line in f: + # Skip if frontmatter + if is_frontmatter: + if line.startswith('---'): + is_frontmatter = False + continue + # Skip HTML + if '<' in line and '>' in line: + in_html_block = True + + if in_html_block: + continue + + _line = line + # Remove existing Markdown links and Liquid tags + _line = re.sub(r'\[(.*?)\]\([^)]*?\)|\{\{[^}]*\}\}|{%[^\n]*%}', ' ... ', _line) + + for word in _line.split(): + # Extract only the letters + raw_word = re.sub(r'^[^a-zA-Z\'-]*|[^a-zA-Z\'-]*$', '', word) + + if raw_word: + # Skip words that have the same spelling in both US and UK English or are wrong for both + if ((us.check(raw_word) == uk.check(raw_word)) or (uk.check(raw_word) and not us.check(raw_word))): + continue + # Skip CamelCase and all-uppercase words + if is_camel_case(raw_word) or raw_word.isupper(): + continue + # Check if the word is a proper noun in context + assert is_proper_noun_in_context(_line, raw_word), \ + f'US-specific spelling: "{raw_word}" in {file_path} \n\n\t at the line: {line} \n\n\tsuggestions: {uk.suggest(raw_word)} \n' + + +EXCLUDE_DIRS = ['vendor'] +EXCLUDE_FILES = ['README.md', 'CHANGELOG.md'] +DIR = ('../../') + +def test_main(): + article_paths = list(walk_directory(DIR, exclude_dirs=EXCLUDE_DIRS, exclude_files=EXCLUDE_FILES)) + for article_path in article_paths: + check_spellings(article_path) diff --git a/.github/tests/test_slugs.py b/.github/tests/test_slugs.py new file mode 100644 index 000000000..756ecd156 --- /dev/null +++ b/.github/tests/test_slugs.py @@ -0,0 +1,32 @@ +import os +from bs4 import BeautifulSoup + +def 
test_unique_slugs(): + + # Get the directory of the current script (./github/tests) + script_dir = os.path.dirname(os.path.abspath(__file__)) + + # Construct the path to the sitemap.xml file in the root directory + sitemap_path = os.path.join(script_dir, '..', '..', '_site', 'sitemap.xml') + + # Read the local sitemap file + with open(sitemap_path, 'r', encoding='utf-8') as file: + soup = BeautifulSoup(file.read(), 'xml') + + url_elements = soup.find_all('loc') + + slugs = [] + for url_element in url_elements: + url_text = url_element.text.split('/')[-1] + slug = url_text + + # Check duplicate/conflicting slugs + assert slug not in slugs, f'Duplicate Slug {slug} for the URL {url_element.text}: {slug} or unnecessary trailing slash' + slugs.append(slug) + + # Check path without 'files' prefix + check_path = url_element.text.split('/')[-2] + + # Check that URL is one level deep + if check_path != 'files': + assert check_path == 'machinetranslate.org', f'{url_element.text}: Paths should be 1 level deep e.g URL/slug' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 000000000..cb15c432b --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,73 @@ +name: CI + +on: + push: + pull_request: + +jobs: + run-tests: + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-versions: + - '3.10' + - '3.11' + name: Test + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.2.0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-versions }} + - name: Install Jekyll and Bundler + run: | + gem install jekyll bundler + bundle install + env: + JEKYLL_ENV: production + + - name: Install Python dependencies + run: | + pip install -r .github/tests/requirements.txt + python -m spacy download en_core_web_sm + + - name: Build Jekyll site + run: bundle exec jekyll build + 
env: + JEKYLL_ENV: development + PAGES_REPO_NWO: ${{ github.repository }} + + - name: Check broken links to pages + run: | + bundle exec htmlproofer _site \ + --enforce-https false \ + --disable-external true \ + --ignore-missing-alt true \ + --allow-missing-href true \ + --check-internal-hash true + env: + JEKYLL_ENV: development + + - name: Serve Jekyll site + run: bundle exec jekyll serve --detach + env: + JEKYLL_ENV: production + PAGES_REPO_NWO: ${{ github.repository }} + + - name: Update pytest + run: pip install --upgrade pytest pytest-asyncio + + - name: Run tests + run: pytest .github/tests + + - name: Stop Jekyll server + run: pkill -f 'bundle exec jekyll serve' || true \ No newline at end of file diff --git a/.gitignore b/.gitignore index 64b6757c1..c810154fa 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,8 @@ _site .jekyll-cache .jekyll-metadata vendor -.ruby-version \ No newline at end of file +.ruby-version +/venv +.pytest_cache +__pycache__ +.github/tests/.pytest_cache \ No newline at end of file diff --git a/Gemfile b/Gemfile index 1729343c7..cd29cf17b 100644 --- a/Gemfile +++ b/Gemfile @@ -21,7 +21,7 @@ group :jekyll_plugins do gem 'jekyll-redirect-from' # gem 'jekyll-target-blank' gem 'jekyll-seo-tag' - gem 'html-proofer' + gem 'html-proofer', '~> 4.3.0' gem 'jekyll-include-cache' end gem 'webrick' diff --git a/Gemfile.lock b/Gemfile.lock index 7d082a356..3d21924fa 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -9,6 +9,7 @@ GEM zeitwerk (~> 2.2, >= 2.2.2) addressable (2.8.5) public_suffix (>= 2.0.2, < 6.0) + base64 (0.2.0) coffee-script (2.4.1) coffee-script-source execjs @@ -24,12 +25,13 @@ GEM ethon (0.16.0) ffi (>= 1.15.0) eventmachine (1.2.7) - execjs (2.8.1) - faraday (2.7.10) + execjs (2.9.1) + faraday (2.7.12) + base64 faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) faraday-net_http (3.0.2) - ffi (1.15.5) + ffi (1.16.3) forwardable-extended (2.6.0) gemoji (3.0.1) github-pages (228) @@ -86,7 +88,7 @@ GEM html-pipeline 
(2.14.3) activesupport (>= 2) nokogiri (>= 1.4) - html-proofer (4.4.3) + html-proofer (4.3.2) addressable (~> 2.3) mercenary (~> 0.3) nokogiri (~> 1.13) @@ -215,13 +217,13 @@ GEM rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.8.4) + mini_portile2 (2.8.5) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.19.0) - nokogiri (1.15.4) + minitest (5.20.0) + nokogiri (1.15.5) mini_portile2 (~> 2.8.2) racc (~> 1.4) octokit (4.25.1) @@ -231,7 +233,7 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (4.0.7) - racc (1.7.1) + racc (1.7.3) rainbow (3.1.1) rb-fsevent (0.11.2) rb-inotify (0.10.1) @@ -254,7 +256,7 @@ GEM terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) - typhoeus (1.4.0) + typhoeus (1.4.1) ethon (>= 0.9.0) tzinfo (1.2.11) thread_safe (~> 0.1) @@ -262,18 +264,18 @@ GEM tzinfo (>= 1.0.0) unf (0.1.4) unf_ext - unf_ext (0.0.8.2) + unf_ext (0.0.9.1) unicode-display_width (1.8.0) webrick (1.8.1) yell (2.2.2) - zeitwerk (2.6.11) + zeitwerk (2.6.12) PLATFORMS ruby DEPENDENCIES github-pages - html-proofer + html-proofer (~> 4.3.0) jekyll-feed (~> 0.12) jekyll-include-cache jekyll-redirect-from @@ -284,4 +286,4 @@ DEPENDENCIES webrick BUNDLED WITH - 2.4.17 + 2.4.22 diff --git a/_config.yml b/_config.yml index c3ed87aa5..ffa488d40 100644 --- a/_config.yml +++ b/_config.yml @@ -241,8 +241,12 @@ social: lsi: false +# Enable incremental builds by default incremental: true +# Add GitHub Metadata (https://github.com/github/pages-gem/issues/399) +github: [metadata] + # Build settings # Note that .files are excluded by default. 
exclude: diff --git a/_data/quality_estimation.json b/_data/quality_estimation.json index 4c3d8c7d0..dac3a739b 100644 --- a/_data/quality_estimation.json +++ b/_data/quality_estimation.json @@ -1644,9 +1644,9 @@ ] }, { - "id": "omniscien-qe", + "id": "omniscien-confidence-scores", "name": "Omniscien Confidence Scores", - "company": "Omniscien-Technologies", + "company": "Omniscien Technologies", "tagline": "Translation confidence scoring and quality estimates", "languages": [ [ @@ -3946,7 +3946,7 @@ "es" ] ], - "only_compatible_mt_api": "kantanmt", + "only_compatible_mt_api": "omniscien", "only_compatible_tms": null, "customisation": true, "urls": [ @@ -4079,7 +4079,7 @@ ] }, { - "id": "translationhub", + "id": "google-translation-hub-mtqp", "name": "Google Translation Hub MTQP", "company": "Google", "tagline": "Machine translation quality prediction scores", @@ -4166,7 +4166,7 @@ ] ], "only_compatible_mt_api": "google", - "only_compatible_tms": "translationhub", + "only_compatible_tms": "translation-hub", "customisation": false, "urls": [ "https://cloud.google.com/translation-hub/docs/user-post-edit#segment-details", diff --git a/_layouts/language.html b/_layouts/language.html index 737faed4c..29c5718d3 100644 --- a/_layouts/language.html +++ b/_layouts/language.html @@ -20,7 +20,7 @@

You can add it to Machine Translate by getting the name, language family, script and typology to languages.json and submitting a pull request. - You may be able to find this information in its Wikipedia article. + You may be able to find this information in its Wikipedia article.

{% endunless %} diff --git a/applications/live-chat.md b/applications/live-chat.md index 5e45029bb..247fdbde4 100644 --- a/applications/live-chat.md +++ b/applications/live-chat.md @@ -23,9 +23,9 @@ Many commercial chat applications that have incorporated machine translation for ## Companies -* [Unbabel](/companies/#unbabel) -* [Language I/O](/companies/#language-io) -* [KantanMT](/companies/#kantanmt) +* [Unbabel](/companies#unbabel) +* [Language I/O](/companies#language-io) +* [KantanMT](/companies#kantanmt) ## See also diff --git a/building-and-research/metrics/metrics.md b/building-and-research/metrics/metrics.md index 2ec715cff..37499c408 100644 --- a/building-and-research/metrics/metrics.md +++ b/building-and-research/metrics/metrics.md @@ -42,7 +42,7 @@ The scores generally do not correlate well with human evaluation scores when tra ### Machine learning-based metrics -Machine learning-based metrics use [sentence embeddings](/sentence-embeddings) to calculate the difference between the generated target sentence and the reference translation, or even between the target senternce and the source sentence. +Machine learning-based metrics use [sentence embeddings](/sentence-embeddings) to calculate the difference between the generated target sentence and the reference translation, or even between the target sentence and the source sentence. Examples:/ - [COMET](/comet) @@ -67,7 +67,7 @@ Human evaluation is the gold standard. - [Adecuacy and fluency judgement](/human-evaluation-metrics#adequacy-and-fluency-judgement) - [Relative ranking](/human-evaluation-metrics#relative-ranking) - [Constituent ranking](/human-evaluation-metrics#constituent-ranking) -- [Yes or no constituent judgement](/human-evaluation-metrics#yes-and-no-constituent-judgement) +- [Yes or no constituent judgement](/human-evaluation-metrics#yes-or-no-constituent-judgement) - [Direct assessment](/human-evaluation-metrics#direct-assessment) But human evaluation is slow, expensive and subjective. 
diff --git a/concepts/context.md b/concepts/context.md new file mode 100644 index 000000000..c01c37ca0 --- /dev/null +++ b/concepts/context.md @@ -0,0 +1,6 @@ +--- +parent: Customisation +layout: coming_soon +title: Context +description: +--- \ No newline at end of file diff --git a/concepts/sentence-embeddings.md b/concepts/sentence-embeddings.md new file mode 100644 index 000000000..9c846e17e --- /dev/null +++ b/concepts/sentence-embeddings.md @@ -0,0 +1,7 @@ +--- +parent: Customisation +layout: coming_soon +title: Sentence embeddings + +description: +--- \ No newline at end of file diff --git a/concepts/tokenisation.md b/concepts/tokenisation.md index d1454a0a2..90227c0b1 100644 --- a/concepts/tokenisation.md +++ b/concepts/tokenisation.md @@ -5,7 +5,7 @@ description: Splitting a string into a sequence of tokens --- **Tokenisation** is the process of splitting a string into a sequence of substrings called tokens. -A [token](/concepts/token) is typically an atomic unit of meaning, such as a word or a punctuation character. +A [token](/token) is typically an atomic unit of meaning, such as a word or a punctuation character. 
Example: diff --git a/contributing/roadmap.md b/contributing/roadmap.md index 8350d4b2e..3b0c48486 100644 --- a/contributing/roadmap.md +++ b/contributing/roadmap.md @@ -17,7 +17,7 @@ community_search_exclude: true - ~~[Zurich Machine Translation Meetup](/zurich-9)~~ - ~~[MTM](/mtm2019)~~ - ~~[AAMT](/aamt2021)~~ -- ~~[ASLTRM](/asltrm2021)~~ +- ~~[ASLTRW](/asltrw2021)~~ - ~~[AT4SSL](/at4ssl2021)~~ - ~~[IWSLT](/iwslt2022)~~ - ~~[LoResMT](/loresmt2022)~~ @@ -33,7 +33,7 @@ community_search_exclude: true - [History \[#65\]](https://github.com/machinetranslate/machinetranslate.org/issues/65) - FAQ -### - ~~[Products](/products)~~ +### ~~[Products]~~ - ~~[Data confidentiality](/data-confidentiality)~~ - Features @@ -77,8 +77,8 @@ community_search_exclude: true - ~~[Gaming](/gaming)~~ - ~~[Social networks](/social-networks)~~ - ~~[Translation and localisation](/translation-and-localisation)~~ - - ~~[Post-editing](/workflows/post-editing)~~ - - ~~[Hybrid translation](/workflows/hybrid-translation)~~ + - ~~[Post-editing](/post-editing)~~ + - ~~[Hybrid translation](/hybrid-translation)~~ - [Human-in-the-loop \[#76\]](https://github.com/machinetranslate/machinetranslate.org/issues/76) - ~~[Multilingual search](/multilingual-search)~~ - ~~[Translation for SEO](/seo)~~ @@ -102,7 +102,7 @@ community_search_exclude: true - [Open datasets \[#89\]](https://github.com/machinetranslate/machinetranslate.org/issues/89) - [Back-translation (and back-copying) \[#81\]](https://github.com/machinetranslate/machinetranslate.org/issues/81) - [Crawling \[#72\]](https://github.com/machinetranslate/machinetranslate.org/issues/72) -- ~~[Filtering](/customisation/filtering)~~ +- ~~[Filtering](/filtering)~~ - ~~[Tokenisation \[#73\]](https://github.com/machinetranslate/machinetranslate.org/issues/73)~~ - [Quality evaluation \[#86\]](https://github.com/machinetranslate/machinetranslate.org/issues/86) - ~~[Metrics \[#314\]](https://github.com/machinetranslate/machinetranslate.org/issues/314)~~ @@ 
-120,11 +120,11 @@ community_search_exclude: true ### Concepts - [Corpus \[#109\]](https://github.com/machinetranslate/machinetranslate.org/issues/109) - - ~~[String](../concepts/string)~~ + - ~~[String](/string)~~ - ~~[Token \[#111\]](https://github.com/machinetranslate/machinetranslate.org/issues/111)~~ - ~~[n-gram \[#108\]](https://github.com/machinetranslate/machinetranslate.org/issues/108)~~ - ~~[Vector \[#112\]](https://github.com/machinetranslate/machinetranslate.org/issues/112)~~ - - ~~[Language model](/concepts/language-model)~~ + - ~~[Language model](/language-model)~~ - ~~[Sentence splitting \[#174\]](https://github.com/machinetranslate/machinetranslate.org/issues/174)~~ - ~~[Word embeddings \[#173\]](https://github.com/machinetranslate/machinetranslate.org/issues/173)~~ - ~~[Lexicon \[#64\]](https://github.com/machinetranslate/machinetranslate.org/issues/64)~~ @@ -137,14 +137,14 @@ community_search_exclude: true - ~~[Libraries / Frameworks](/libraries-and-frameworks)~~ - ~~[Publications](/publications)~~ - [Early Years of Machine Translation \[205\]](https://github.com/machinetranslate/machinetranslate.org/issues/205) -- ~~[Tutorials](/integration/tutorials)~~ +- ~~[Tutorials](/tutorials)~~ ### More - People - ~~[John Hutchins](/john-hutchins)~~ - ~~[Georges Artsrouni](/georges-artsrouni)~~ - - ~~[Petr Troianskii](/petr-troianskii)~~ + - ~~[Petr Troianskii](/petr-troyanskii)~~ - ~~[Warren Weaver](/warren-weaver)~~ - [Salim Roukos \[#168\]](https://github.com/machinetranslate/machinetranslate.org/issues/168) - [Hermann Ney \[#103\]](https://github.com/machinetranslate/machinetranslate.org/issues/103) diff --git a/contributing/style.md b/contributing/style.md index 58e29d8a7..f92a4a1ff 100644 --- a/contributing/style.md +++ b/contributing/style.md @@ -307,7 +307,7 @@ She researches [quality estimation](/quality/quality-estimation.md) with Kevin J For a specific **section** of an article, use the **fragment identifier**. 
``` -She researches quality estimation with Kevin Johnson at [Microsoft](industry/companies.md#microsoft). +She researches quality estimation with Kevin Johnson at [Microsoft](/companies#microsoft). ``` Avoid trailing slashes at the end of links. diff --git a/events/aamt-seminar-1.md b/events/aamt-seminar-1.md index 46477743c..1d42f3422 100644 --- a/events/aamt-seminar-1.md +++ b/events/aamt-seminar-1.md @@ -23,7 +23,7 @@ seo: url: https://aamt.info/ --- -The first Asian-Pacific Association for Machine Translation (**[AAMT](/associations/aamt.md) 2020**) Seminar took place online on 28 September, 2022. +The first Asian-Pacific Association for Machine Translation (**[AAMT](/aamt) 2020**) Seminar took place online on 28 September, 2022. [aamt.info/event/seminar/20220928](https://www.aamt.info/event/seminar/20220928) diff --git a/events/aamt2019.md b/events/aamt2019.md index cc685de08..c54b6bf1a 100644 --- a/events/aamt2019.md +++ b/events/aamt2019.md @@ -23,7 +23,7 @@ seo: url: https://aamt.info/ --- -The first Asian-Pacific Association for Machine Translation conference (**[AAMT](/associations/aamt.md) 2020**) took place on 19 November, 2019, in Chiyoda, Tokyo. +The first Asian-Pacific Association for Machine Translation conference (**[AAMT](/aamt) 2020**) took place on 19 November, 2019, in Chiyoda, Tokyo. [aamt.info/aamttokyo2019](https://aamt.info/aamttokyo2019/) diff --git a/events/ai-and-language-technologies.md b/events/ai-and-language-technologies.md index 7100321a5..f92083ef8 100644 --- a/events/ai-and-language-technologies.md +++ b/events/ai-and-language-technologies.md @@ -27,7 +27,7 @@ seo: The **Artificial Intelligence and Language Technologies** will take place online on 21 November, 2022. The event is organised by [Omniscien Technologies](/companies#omniscien-technologies). -The speakers will be [Philipp Koehn](/people/philipp-koehn) and Dion Wiggins. +The speakers will be [Philipp Koehn](/philipp-koehn) and Dion Wiggins. 
> We will look at trends in Neural Machine Translation, Speech Recognition, Text to Speech, Natural Language Processing, Artificial Intelligence, Machine Learning, Data Mining, Text Analytics, and more. diff --git a/events/ai-opportunities-and-risk.md b/events/ai-opportunities-and-risk.md index af444b3a2..eccac026e 100644 --- a/events/ai-opportunities-and-risk.md +++ b/events/ai-opportunities-and-risk.md @@ -30,7 +30,7 @@ It was organised by [Omniscien](/companies#omniscien-technologies). ### Speakers -- Professor [Philipp Koehn](/people/philipp-koehn), Chief Scientist at Omniscien and Professor at Johns Hopkins University +- Professor [Philipp Koehn](/philipp-koehn), Chief Scientist at Omniscien and Professor at Johns Hopkins University - Dion Wiggins, Chief Technology Officer at Omniscien - Dr Joseph Sweeney, Industry Analyst, Intelligent Business Research Services (IBRS) diff --git a/events/events.md b/events/events.md index abc74a9cf..c67ffecac 100644 --- a/events/events.md +++ b/events/events.md @@ -71,7 +71,7 @@ seo: | 16 October | [LoResMT 2022](/loresmt2022) | Gyeongju, Republic of Korea | | 14 October | [Debunking 'No language left behind', 'Human parity' and other machine translation myths](/debunking-nllb) | online | | 12 - 17 October | [WAT 2022](/wat2022) | Gyeongju, Republic of Korea | -| 11 - 13 October | [Massively Multilingual Conference & Expo](/massively-multilingual-conference-expo) | San Jose, California | +| 11 - 13 October | [Massively Multilingual Conference & Expo](/taus2022) | San Jose, California | | 6 October | [Literary Machine Translation as a Human-Machine Dialectic](/human-machine-dialectic) | Liège, Belgium | | 30 September | [MUMTTT 2022](/mumttt2022) | Malaga, Spain | | 28 September | [AAMT Seminar](/aamt-seminar-1) | online | diff --git a/events/wmt09.md b/events/wmt09.md index 5d754cea5..1246c85c4 100644 --- a/events/wmt09.md +++ b/events/wmt09.md @@ -109,7 +109,7 @@ Full results of the shared tasks: [*Findings of the 2009 
Workshop on Statistical ### News translation -The results were determined with a [relative ranking](../building-and-research/metrics/human-evaluation-metrics#relative-ranking), the `≥ others` (“greater than or equal to others”) score. +The results were determined with a [relative ranking](/human-evaluation-metrics#relative-ranking), the `≥ others` (“greater than or equal to others”) score. It measures how often a system was judged to be better than or equal to any other system. diff --git a/integrations/translation-hub.md b/integrations/translation-hub.md index 7818459e1..944d19431 100644 --- a/integrations/translation-hub.md +++ b/integrations/translation-hub.md @@ -19,7 +19,10 @@ api_integrations: name: Google Translate fuzzy_repair: false open-source: false -quality_estimation_integrations: [] +quality_estimation_integrations: +- slug: google-translation-hub-mtqp + custom: false + name: Google Translation Hub MTQP seo: name: Machine translation API integrations in Google Cloud Translation Hub type: Product diff --git a/languages/albanian.md b/languages/albanian.md index c2bd9a3b3..4284e8e3d 100644 --- a/languages/albanian.md +++ b/languages/albanian.md @@ -84,4 +84,4 @@ seo: type: Language --- -The language code `als` is the ISO 639-3 code for Tosk Albanian, but it is used by Wikipedia for [Alemannic](/languages/alemannic.md). +The language code `als` is the ISO 639-3 code for Tosk Albanian, but it is used by Wikipedia for [Alemannic](/alemannic). diff --git a/languages/alemannic.md b/languages/alemannic.md index a3dc9f80d..f4b0b9550 100644 --- a/languages/alemannic.md +++ b/languages/alemannic.md @@ -48,4 +48,4 @@ seo: type: Language --- -Wikipedia uses the language code `als` for Alemannic, but it is actually the ISO 639-3 code for [Tosk Albanian](/languages/albanian.md). +Wikipedia uses the language code `als` for Alemannic, but it is actually the ISO 639-3 code for [Tosk Albanian](/albanian). 
diff --git a/languages/arabic.md b/languages/arabic.md index 71699ab27..e59fc3e19 100644 --- a/languages/arabic.md +++ b/languages/arabic.md @@ -136,13 +136,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/bengali.md b/languages/bengali.md index 70fc683cb..d2ba40100 100644 --- a/languages/bengali.md +++ b/languages/bengali.md @@ -73,7 +73,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/bulgarian.md b/languages/bulgarian.md index 17e339052..7747992db 100644 --- a/languages/bulgarian.md +++ b/languages/bulgarian.md @@ -94,7 +94,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/burmese.md b/languages/burmese.md index 007e94a64..30b2282c1 100644 --- a/languages/burmese.md +++ b/languages/burmese.md @@ -62,7 +62,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: demt-estimate id: demt-estimate diff --git a/languages/chinese.md b/languages/chinese.md index fc755c8b6..de1a2e5c9 100644 --- a/languages/chinese.md +++ b/languages/chinese.md @@ -143,13 +143,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google 
Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/czech.md b/languages/czech.md index 33ec2dca7..f5225e162 100644 --- a/languages/czech.md +++ b/languages/czech.md @@ -104,7 +104,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/danish.md b/languages/danish.md index 0b3ea31ca..051175901 100644 --- a/languages/danish.md +++ b/languages/danish.md @@ -106,7 +106,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/dutch.md b/languages/dutch.md index d4520b582..a1d1c2c87 100644 --- a/languages/dutch.md +++ b/languages/dutch.md @@ -127,7 +127,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/english.md b/languages/english.md index 7f19ec66c..061b59606 100644 --- a/languages/english.md +++ b/languages/english.md @@ -160,13 +160,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/estonian.md b/languages/estonian.md index 4541dd8ce..da0dd8bad 100644 --- a/languages/estonian.md 
+++ b/languages/estonian.md @@ -90,7 +90,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/finnish.md b/languages/finnish.md index cc95d696a..4c50ad563 100644 --- a/languages/finnish.md +++ b/languages/finnish.md @@ -104,7 +104,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/french.md b/languages/french.md index f7e7e62bf..6187465e5 100644 --- a/languages/french.md +++ b/languages/french.md @@ -145,13 +145,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/german.md b/languages/german.md index e784ca7d0..e4514d44c 100644 --- a/languages/german.md +++ b/languages/german.md @@ -137,13 +137,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/greek.md b/languages/greek.md index 9f62a0a9a..c2c0b81b0 100644 --- a/languages/greek.md +++ b/languages/greek.md @@ -95,7 +95,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: 
omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/gujarati.md b/languages/gujarati.md index 707492b1d..2e50c0132 100644 --- a/languages/gujarati.md +++ b/languages/gujarati.md @@ -64,7 +64,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/hebrew.md b/languages/hebrew.md index 0f27bb352..47a2454f5 100644 --- a/languages/hebrew.md +++ b/languages/hebrew.md @@ -93,7 +93,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/hindi.md b/languages/hindi.md index 1d74beaaf..55232bb37 100644 --- a/languages/hindi.md +++ b/languages/hindi.md @@ -106,13 +106,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/hungarian.md b/languages/hungarian.md index a5ffbc3db..2bc232c7f 100644 --- a/languages/hungarian.md +++ b/languages/hungarian.md @@ -106,7 +106,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/indonesian.md b/languages/indonesian.md index 5b03985c7..d5b878ffc 100644 --- a/languages/indonesian.md +++ b/languages/indonesian.md @@ -105,7 +105,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: 
omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/irish.md b/languages/irish.md index 4c0e2de17..b9ba33f44 100644 --- a/languages/irish.md +++ b/languages/irish.md @@ -70,7 +70,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/italian.md b/languages/italian.md index 31a167337..c642b85b5 100644 --- a/languages/italian.md +++ b/languages/italian.md @@ -127,13 +127,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/japanese.md b/languages/japanese.md index 0b23ad7ac..1d35b5fef 100644 --- a/languages/japanese.md +++ b/languages/japanese.md @@ -116,13 +116,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/khmer.md b/languages/khmer.md index 694b45896..1996e12eb 100644 --- a/languages/khmer.md +++ b/languages/khmer.md @@ -55,7 +55,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: demt-estimate id: demt-estimate diff --git 
a/languages/korean.md b/languages/korean.md index 41a8b3ae5..5c958e7b8 100644 --- a/languages/korean.md +++ b/languages/korean.md @@ -114,7 +114,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/latvian.md b/languages/latvian.md index 29960aa6b..3c5b9e360 100644 --- a/languages/latvian.md +++ b/languages/latvian.md @@ -90,7 +90,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/lithuanian.md b/languages/lithuanian.md index d0bc06ab7..8bcc70bc7 100644 --- a/languages/lithuanian.md +++ b/languages/lithuanian.md @@ -88,7 +88,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/malay.md b/languages/malay.md index 0d409e4ca..e1ce3cfb6 100644 --- a/languages/malay.md +++ b/languages/malay.md @@ -88,7 +88,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/maltese.md b/languages/maltese.md index 610576ca4..acf3eec0b 100644 --- a/languages/maltese.md +++ b/languages/maltese.md @@ -73,7 +73,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/nepali.md b/languages/nepali.md index 10708ac0d..24266f137 100644 --- a/languages/nepali.md +++ b/languages/nepali.md @@ -55,7 
+55,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/norwegian.md b/languages/norwegian.md index 4507e01ab..e30ec731e 100644 --- a/languages/norwegian.md +++ b/languages/norwegian.md @@ -87,7 +87,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: demt-estimate id: demt-estimate diff --git a/languages/persian.md b/languages/persian.md index 030ce2388..400b45876 100644 --- a/languages/persian.md +++ b/languages/persian.md @@ -94,7 +94,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/polish.md b/languages/polish.md index 7aa573416..039d8c9d6 100644 --- a/languages/polish.md +++ b/languages/polish.md @@ -109,7 +109,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/portuguese.md b/languages/portuguese.md index da61460b4..2014ee6e8 100644 --- a/languages/portuguese.md +++ b/languages/portuguese.md @@ -128,13 +128,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/romanian.md b/languages/romanian.md index 0632fcca1..794dedbad 100644 --- 
a/languages/romanian.md +++ b/languages/romanian.md @@ -99,7 +99,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/russian.md b/languages/russian.md index 01d34a835..d53407800 100644 --- a/languages/russian.md +++ b/languages/russian.md @@ -132,13 +132,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe @@ -162,16 +162,16 @@ Among the best supported languages, it is notable for a few features: Russian has the best machine translation support among Slavic languages and among languages written in the Cyrillic alphabet. -[English](english.md)-Russian and Russian-English are the highest traffic language pairs for [Yandex Translate](/apis/yandex.md). +[English](/english)-Russian and Russian-English are the highest traffic language pairs for [Yandex Translate](/yandex). -Russian-[Chinese](chinese.md) and Chinese-Russian are also significant translation pairs. +Russian-[Chinese](/chinese) and Chinese-Russian are also significant translation pairs. ## History -In 1935, [Petr Troyanskii](/people/petr-troyanskii.md) filed USSR Patent 40995 on machine translation. +In 1935, [Petr Troyanskii](/petr-troyanskii) filed USSR Patent 40995 on machine translation. During the Cold War, English and Russian were the most important languages geopolitically, and much machine translation research and development was for defence and intelligence agencies. 
In the 2010s, while most providers used bridging or pivoting via English for most language pairs, Yandex Translate launched direct translation between Russian and many languages. -By 2020, Russian was supported by all the major translation application and API providers, including [Google Translate](/apis/google.md), Yandex Translate, [Microsoft Translator](/apis/microsoft.md), [Amazon Translate](/apis/amazon.md), [ModernMT](/apis/modernmt.md) and [DeepL](/apis/deepl.md). +By 2020, Russian was supported by all the major translation application and API providers, including [Google Translate](/google), Yandex Translate, [Microsoft Translator](/microsoft), [Amazon Translate](/amazon), [ModernMT](/modernmt) and [DeepL](/deepl). diff --git a/languages/slovak.md b/languages/slovak.md index f49d03ff9..b365d1ce9 100644 --- a/languages/slovak.md +++ b/languages/slovak.md @@ -95,7 +95,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/slovenian.md b/languages/slovenian.md index ca0d24098..955e156a6 100644 --- a/languages/slovenian.md +++ b/languages/slovenian.md @@ -92,7 +92,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/spanish.md b/languages/spanish.md index d9ba02835..094e5d9ee 100644 --- a/languages/spanish.md +++ b/languages/spanish.md @@ -158,13 +158,13 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: google-translation-hub-mtqp - id: translationhub + id: google-translation-hub-mtqp name: Google Translation Hub MTQP - slug: kantanqes id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: 
phrase-mtqe id: phrase-mtqe diff --git a/languages/swahili.md b/languages/swahili.md index 0328cd608..1eef3ea39 100644 --- a/languages/swahili.md +++ b/languages/swahili.md @@ -71,7 +71,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: demt-estimate id: demt-estimate diff --git a/languages/swedish.md b/languages/swedish.md index 68ccd0353..cf8ebebb6 100644 --- a/languages/swedish.md +++ b/languages/swedish.md @@ -110,7 +110,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/tagalog.md b/languages/tagalog.md index 6cf59fcc5..4ca52888f 100644 --- a/languages/tagalog.md +++ b/languages/tagalog.md @@ -73,7 +73,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/thai.md b/languages/thai.md index b236e1354..014b8522b 100644 --- a/languages/thai.md +++ b/languages/thai.md @@ -101,7 +101,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/turkish.md b/languages/turkish.md index 9d4292e2e..98ca1cce6 100644 --- a/languages/turkish.md +++ b/languages/turkish.md @@ -111,7 +111,7 @@ supported_qe_apis: id: kantanqes name: KantanQES - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe @@ -134,14 +134,14 @@ Turkic languages are agglutinative and usually have vowel harmony. 
## History In 1994, NATO supported two machine translation English to Turkish projects within the scope of the Science for Stability Program. -These projects were [rule-based machine translation](/approaches/rule-based-machine-translation.md) systems within specific domains: news captions and IBM manuals. +These projects were [rule-based machine translation](/rule-based-machine-translation) systems within specific domains: news captions and IBM manuals. In 2011, Apertium launched the Apertium Turkic work group in an attempt to create resources for Turkic languages rule-based machine translation and natural language processing. In 2019, Emel Alkım and Yalçın Çebi launched MT-Turk for Turkish, Kirghiz and Kazan Tatar. MT-Turk is a rule-based machine translation system that uses interlingual and transfer systems. -In 2020, [Turkic Interlingua](/community/communities.md#til) was created to develop language technology, including parallel corpora, in the most spoken Turkic languages. +In 2020, [Turkic Interlingua](/communities#til) was created to develop language technology, including parallel corpora, in the most spoken Turkic languages. In 2021, the Literary Machine Translation Project started at the Department of Translation and Interpreting Studies from the Boğaziçi University. @@ -151,7 +151,7 @@ In 2021, the Literary Machine Translation Project started at the Department of T The [Turkish National Corpus (TNC)](https://www.tnc.org.tr/) contains 50 million Turkish words. -The Turkish, Kurdish, and English [Bianet newspaper](https://opus.nlpl.eu/Bianet.php) parallel corpus contains 3,214 Turkish articles aligned with its Kurdish or English translations. +The Turkish, Kurdish, and English [Bianet newspaper](https://opus.nlpl.eu/Bianet.php) parallel corpus contains 3,214 Turkish articles aligned with its Kurdish or English translations.
The [TS Corpus](https://tscorpus.com/) project offers English-Turkish parallel databases for academic and research purposes. @@ -159,11 +159,11 @@ The [TDD](https://tdd.ai/) project serves a series of [corpora](https://corpus.t ## Support -In 2016, [Google Translate](/apis/google.md) included Turkish in the launch of Google Neural Machine Translation. +In 2016, [Google Translate](/google) included Turkish in the launch of Google Neural Machine Translation. -By 2022, Turkish was supported by all the major translation applications and API providers, including Google Translate, [Microsoft Translator](/apis/microsoft.md), [Yandex Translate](/apis/yandex.md), [Amazon Translate](/apis/amazon.md), and [Watson Language Translator](/apis/watson.md). +By 2022, Turkish was supported by all the major translation applications and API providers, including Google Translate, [Microsoft Translator](/microsoft), [Yandex Translate](/yandex), [Amazon Translate](/amazon), and [Watson Language Translator](/watson). By 2022, Yandex Translate had direct translation systems for Turkish-Russian and Turkish-German. Turkish is one of the top languages for Yandex Translate. -English to Turkish is one of the top language pairs for [Systran](/apis/systran.md). +English to Turkish is one of the top language pairs for [Systran](/systran). 
diff --git a/languages/urdu.md b/languages/urdu.md index 7df0c0f88..51bc02b1c 100644 --- a/languages/urdu.md +++ b/languages/urdu.md @@ -79,7 +79,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/languages/vietnamese.md b/languages/vietnamese.md index d67f140cf..b0071c4e4 100644 --- a/languages/vietnamese.md +++ b/languages/vietnamese.md @@ -95,7 +95,7 @@ supported_apis: name: Niutrans supported_qe_apis: - slug: omniscien-confidence-scores - id: omniscien-qe + id: omniscien-confidence-scores name: Omniscien Confidence Scores - slug: phrase-mtqe id: phrase-mtqe diff --git a/more/associations/iamt.md b/more/associations/iamt.md index 2ca4de7a4..5966263d0 100644 --- a/more/associations/iamt.md +++ b/more/associations/iamt.md @@ -30,7 +30,7 @@ The regional associations of IAMT hold the [MT Summit](/mt-summit). ## Executive Committee -The current executive committee will serve through MT Summit 2023. +The current executive committee will serve through [MT Summit 2023](/mtsummit2023). ### President Eiichiro Sumita, National Institute of Information and Communications Technology, Japan (AAMT) diff --git a/more/associations/mt-summit.md b/more/associations/mt-summit.md new file mode 100644 index 000000000..73086093c --- /dev/null +++ b/more/associations/mt-summit.md @@ -0,0 +1,7 @@ +--- +grand_parent: More +parent: Associations +layout: coming_soon +title: MT Summit +description: +--- diff --git a/more/people/alon-lavie.md b/more/people/alon-lavie.md index 933855ed3..1ae9275c0 100644 --- a/more/people/alon-lavie.md +++ b/more/people/alon-lavie.md @@ -39,7 +39,7 @@ In 2015, it was acquired by Amazon, and became Amazon Translate. In 2019, Lavie joined [Unbabel](/companies#unbabel) as Head of AI. -In 2023, Lavie joined [Phrase](/phrase) as VP of AI Research. 
+In 2023, Lavie joined [Phrase](/phrase-tms) as VP of AI Research. --- diff --git a/quality-estimation/google-translation-hub-mtqp.md b/quality-estimation/google-translation-hub-mtqp.md index 778009f27..e8f87c259 100644 --- a/quality-estimation/google-translation-hub-mtqp.md +++ b/quality-estimation/google-translation-hub-mtqp.md @@ -6,7 +6,7 @@ layout: quality_estimation title: Google Translation Hub MTQP description: The Google Translation Hub MTQP translation quality estimation API tagline: Machine translation quality prediction scores -id: translationhub +id: google-translation-hub-mtqp company: Google parent: Quality estimation urls: @@ -72,10 +72,13 @@ only_compatible_mt_api: - id: google name: Google Translate only_compatible_tms: -- id: translationhub - name: translationhub +- id: translation-hub + name: Google Cloud Translation Hub customisation: false -integrations: [] +integrations: +- slug: translation-hub + name: Google Cloud Translation Hub + custom: false seo: name: The Google Translation Hub MTQP translation quality estimation API type: Product diff --git a/quality-estimation/omniscien-confidence-scores.md b/quality-estimation/omniscien-confidence-scores.md index 89d43a4e8..33c42442f 100644 --- a/quality-estimation/omniscien-confidence-scores.md +++ b/quality-estimation/omniscien-confidence-scores.md @@ -6,7 +6,7 @@ layout: quality_estimation title: Omniscien Confidence Scores description: The Omniscien Confidence Scores translation quality estimation API tagline: Translation confidence scoring and quality estimates -id: omniscien-qe +id: omniscien-confidence-scores company: Omniscien Technologies parent: Quality estimation urls: diff --git a/quality-estimation/quality-estimation.md b/quality-estimation/quality-estimation.md index b166a9c90..bdc12d039 100644 --- a/quality-estimation/quality-estimation.md +++ b/quality-estimation/quality-estimation.md @@ -76,7 +76,7 @@ The first framework, QuEst, was released in 2013. 
| [OpenKiwi](https://github.com/Unbabel/OpenKiwi) | Unbabel | Deep learning | | [TransQuest](https://github.com/TharinduDR/TransQuest) | Tharindu Ranasinghe, University of Wolverhampton | Deep learning | -TransQuest also includes pretrained models. The models were pretrained with [WMT](/associations/wmt) data. +TransQuest also includes pretrained models. The models were pretrained with [WMT](/wmt) data. ### Providers @@ -108,10 +108,10 @@ There is a quality estimation integration or connector available for most transl | Product | Feature | Provider | | ---| --- | --- | | [translate5](/translate5) | [ModelFront quality prediction plug-in](https://www.modelfront.com/translate5) | [ModelFront](/modelfront) | -| [memoQ](/memoq) | [Quality estimates (AIQE)](https://www.memoq.com/mt-quality-estimates) | [ModelFront](/modelfront), [TAUS](/taus) [DeMT Estimate API](/demt-estimate-api) | +| [memoQ](/memoq) | [Quality estimates (AIQE)](https://www.memoq.com/mt-quality-estimates) | [ModelFront](/modelfront), [DeMT Estimate](/demt-estimate) | | [Crowdin](/crowdin) | [ModelFront quality prediction](https://store.crowdin.com/modelfront) | [ModelFront](/modelfront) | | [XTM](/xtm) | ModelFront XTM connector | [ModelFront](/modelfront) | KantanStream | [KantanQES](https://www.kantanai.io/kantanqes-home/) | [KantanAI](/companies#kantanmt) | -| [PhraseTMS](phrase-tms) | [MT quality estimation](https://support.phrase.com/hc/en-us/articles/5709672289180-MT-Quality-Estimation-TMS-), ModelFront Phrase connector | [Phrase QE](/phrase-qe) | +| [PhraseTMS](/phrase-tms) | [MT quality estimation](https://support.phrase.com/hc/en-us/articles/5709672289180-MT-Quality-Estimation-TMS-), ModelFront Phrase connector | [Phrase QE](/phrase-mtqe) | | GlobalDoc LangXpert | [Effort estimation](https://globaldoc.com/about-us/our-technology/langxpert-modelfront-technology-partnership/) | [ModelFront](/modelfront) | | Google Cloud Translation Hub | [Machine translation quality 
prediction](https://cloud.google.com/translation-hub/docs/translator-edit#segment-details) | [Google Cloud Translation Hub - MTQP](/google-translation-hub-mtqp) | diff --git a/resources/education.md b/resources/education.md index 674d5b944..f81aa0fcc 100644 --- a/resources/education.md +++ b/resources/education.md @@ -51,7 +51,7 @@ This course is no longer offered, it is now a part of the course “Natural lang ## Karlsruhe Institute of Technology The **Machine Translation** course is a massive open online course from Karlsruhe Institute of Technology. -This pre-recorded course is taught by Alexander Waibel, the co-founder of [KITES](/companies#Kites), and Jan Niehus. +This pre-recorded course is taught by Alexander Waibel, the co-founder of [KITES](/companies#kites), and Jan Niehus. [coursera.org/learn/machinetranslation](https://www.coursera.org/learn/machinetranslation) @@ -82,7 +82,7 @@ This interactive class is designed for project managers, vendors, buyers, decisi ## Intento -The **Machine Translation University** is a set of free online learning resources from [Intento](/companies#Intento). +The **Machine Translation University** is a set of free online learning resources from [Intento](/companies#intento). These resources are designed for localization specialists and buyers. 
[inten.to/machine-translation-university/](https://inten.to/machine-translation-university/) diff --git a/resources/reports.md b/resources/reports.md index cfd0d5362..0007dbfc8 100644 --- a/resources/reports.md +++ b/resources/reports.md @@ -8,7 +8,7 @@ seo: | | | | --- | --- | -| [Phrase](/phrase) | [Q2 2023 Machine Translation Report](https://phrase.com/resources/machine-translation-report/) | +| [Phrase](/phrase-tms) | [Q2 2023 Machine Translation Report](https://phrase.com/resources/machine-translation-report/) | | [Intento](/companies#intento) | [The State of Machine Translation 2023](https://inten.to/machine-translation-report-2023/?utm_campaign=MT%20Report%202023&utm_source=machine_translate) | | [Weglot](/weglot) | [The state of machine translation for websites 2023](https://www.weglot.com/ebooks/state-of-machine-translation-report) | | Phrase | [Q1 2023 Machine Translation Report](https://phrase.com/resources/machine-translation-report/) |