From 411eaddeb5a8f5259cbc86179d0546bf985e88d6 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 15 Feb 2024 15:11:55 -0500 Subject: [PATCH] chores: py support; GHA; pyproject.toml; dev env updates; license yr; pytest; etc (#426) * chores: pyproject.toml; ruff; license yr; pytest; etc * tox -e lint * Fix fork button; remove mentions of Python 2 * Remove translate module and methods * Remove compat module * minor docs updates * remove unused coverage config * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update readme * Remove unused ignores * Update contributing * Remove unnecessary mock dep * add readthedocs config --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .coveragerc | 14 - .github/dependabot.yml | 11 + .github/workflows/build-release.yml | 80 ++ .gitignore | 36 +- .pre-commit-config.yaml | 15 + .readthedocs.yml | 15 + .travis.yml | 40 - CHANGELOG.rst | 14 + CONTRIBUTING.rst | 25 +- LICENSE | 2 +- MANIFEST.in | 3 - README.rst | 28 +- dev-requirements.txt | 5 - docs/_templates/side-primary.html | 49 +- docs/_templates/side-secondary.html | 26 +- docs/_themes/flask_theme_support.py | 147 ++- docs/_themes/kr/layout.html | 39 +- docs/_themes/kr_small/layout.html | 37 +- docs/classifiers.rst | 45 +- docs/conf.py | 73 +- docs/index.rst | 16 +- docs/install.rst | 8 +- docs/quickstart.rst | 25 +- docs/requirements.txt | 4 - pyproject.toml | 100 ++ run_tests.py | 70 - setup.cfg | 20 - setup.py | 68 - src/textblob/__init__.py | 9 + {textblob => src/textblob}/_text.py | 1170 +++++++++++------ {textblob => src/textblob}/base.py | 30 +- {textblob => src/textblob}/blob.py | 393 +++--- {textblob => src/textblob}/classifiers.py | 162 ++- {textblob => src/textblob}/decorators.py | 14 +- .../textblob}/download_corpora.py | 19 +- src/textblob/en/__init__.py | 133 ++ {textblob => src/textblob}/en/en-context.txt | 0 {textblob => src/textblob}/en/en-entities.txt | 0 {textblob => src/textblob}/en/en-lexicon.txt | 0 .../textblob}/en/en-morphology.txt | 0 .../textblob}/en/en-sentiment.xml | 0 {textblob => src/textblob}/en/en-spelling.txt | 0 src/textblob/en/inflect.py | 878 +++++++++++++ .../textblob}/en/np_extractors.py | 145 +- {textblob => src/textblob}/en/parsers.py | 4 +- {textblob => src/textblob}/en/sentiments.py | 51 +- {textblob => src/textblob}/en/taggers.py | 11 +- {textblob => src/textblob}/exceptions.py | 15 +- {textblob => src/textblob}/formats.py | 40 +- src/textblob/inflect.py | 15 + {textblob => src/textblob}/mixins.py | 69 +- {textblob => src/textblob}/np_extractors.py | 8 +- {textblob => src/textblob}/parsers.py | 10 +- src/textblob/sentiments.py | 24 + src/textblob/taggers.py | 17 + {textblob => src/textblob}/tokenizers.py | 32 +- .../textblob}/unicodecsv/__init__.py | 161 ++- {textblob => src/textblob}/utils.py | 11 +- {textblob => src/textblob}/wordnet.py | 1 - tasks.py | 51 - tests/test_blob.py | 1026 +++++++-------- tests/test_classifiers.py | 309 +++-- tests/test_decorators.py | 14 +- tests/test_formats.py | 84 +- tests/test_inflect.py | 25 +- tests/test_np_extractor.py | 41 +- tests/test_parsers.py | 9 +- tests/test_sentiments.py | 64 +- tests/test_taggers.py | 78 +- tests/test_tokenizers.py | 84 +- tests/test_utils.py | 26 +- textblob/__init__.py | 16 - textblob/compat.py | 53 - textblob/en/__init__.py | 139 -- textblob/en/inflect.py | 472 ------- textblob/inflect.py | 17 - textblob/sentiments.py | 22 - textblob/taggers.py | 18 - textblob/translate.py | 149 --- 
tox.ini | 36 +- 80 files changed, 3853 insertions(+), 3317 deletions(-) delete mode 100644 .coveragerc create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/build-release.yml create mode 100644 .pre-commit-config.yaml create mode 100644 .readthedocs.yml delete mode 100644 .travis.yml delete mode 100644 MANIFEST.in delete mode 100644 dev-requirements.txt delete mode 100644 docs/requirements.txt create mode 100644 pyproject.toml delete mode 100644 run_tests.py delete mode 100644 setup.cfg delete mode 100644 setup.py create mode 100644 src/textblob/__init__.py rename {textblob => src/textblob}/_text.py (54%) rename {textblob => src/textblob}/base.py (86%) rename {textblob => src/textblob}/blob.py (69%) rename {textblob => src/textblob}/classifiers.py (79%) rename {textblob => src/textblob}/decorators.py (77%) rename {textblob => src/textblob}/download_corpora.py (63%) create mode 100644 src/textblob/en/__init__.py rename {textblob => src/textblob}/en/en-context.txt (100%) rename {textblob => src/textblob}/en/en-entities.txt (100%) rename {textblob => src/textblob}/en/en-lexicon.txt (100%) rename {textblob => src/textblob}/en/en-morphology.txt (100%) rename {textblob => src/textblob}/en/en-sentiment.xml (100%) rename {textblob => src/textblob}/en/en-spelling.txt (100%) create mode 100644 src/textblob/en/inflect.py rename {textblob => src/textblob}/en/np_extractors.py (57%) rename {textblob => src/textblob}/en/parsers.py (86%) rename {textblob => src/textblob}/en/sentiments.py (70%) rename {textblob => src/textblob}/en/taggers.py (80%) rename {textblob => src/textblob}/exceptions.py (93%) rename {textblob => src/textblob}/formats.py (88%) create mode 100644 src/textblob/inflect.py rename {textblob => src/textblob}/mixins.py (75%) rename {textblob => src/textblob}/np_extractors.py (74%) rename {textblob => src/textblob}/parsers.py (58%) create mode 100644 src/textblob/sentiments.py create mode 100644 src/textblob/taggers.py rename {textblob => src/textblob}/tokenizers.py (75%) rename {textblob => src/textblob}/unicodecsv/__init__.py (61%) rename {textblob => src/textblob}/utils.py (80%) rename {textblob => src/textblob}/wordnet.py (94%) delete mode 100644 tasks.py delete mode 100644 textblob/__init__.py delete mode 100644 textblob/compat.py delete mode 100644 textblob/en/__init__.py delete mode 100644 textblob/en/inflect.py delete mode 100644 textblob/inflect.py delete mode 100644 textblob/sentiments.py delete mode 100644 textblob/taggers.py delete mode 100644 textblob/translate.py diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 4fb75141..00000000 --- a/.coveragerc +++ /dev/null @@ -1,14 +0,0 @@ -[run] -include = - textblob* -omit = - # Vendorized dependencies - *unicodecsv* - # Pattern.en code - text/en/__init__.py - text/_text.py - text/en/inflect.py - -[report] -exclude_lines = - raise NotImplementedError diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..a04147ca --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: +- package-ecosystem: pip + directory: "/" + schedule: + interval: daily + open-pull-requests-limit: 10 +- package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml new file mode 100644 index 00000000..03d7c80c --- /dev/null +++ b/.github/workflows/build-release.yml @@ -0,0 +1,80 @@ +name: build +on: + push: + branches: ["dev"] + tags: ["*"] + 
pull_request: + +jobs: + tests: + name: ${{ matrix.name }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - { name: "3.8", python: "3.8", tox: py38 } + - { name: "3.12", python: "3.12", tox: py312 } + - { name: "lowest", python: "3.8", tox: py38-lowest } + steps: + - uses: actions/checkout@v4.0.0 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Download nltk data + run: wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz + - name: Extract nltk data + run: tar -xzvf nltk_data-0.11.0.tar.gz -C ~ + - run: python -m pip install tox + - run: python -m tox -e${{ matrix.tox }} + build: + name: Build package + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install pypa/build + run: python -m pip install build + - name: Build a binary wheel and a source tarball + run: python -m build + - name: Install twine + run: python -m pip install twine + - name: Check build + run: python -m twine check --strict dist/* + - name: Store the distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ + # this duplicates pre-commit.ci, so only run it on tags + # it guarantees that linting is passing prior to a release + lint-pre-release: + if: startsWith(github.ref, 'refs/tags') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4.0.0 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - run: python -m pip install tox + - run: python -m tox -e lint + publish-to-pypi: + name: PyPI release + if: startsWith(github.ref, 'refs/tags/') + needs: [build, tests, lint-pre-release] + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/textblob + permissions: + id-token: write + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.gitignore b/.gitignore index 972ac8ae..f51e020f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ -### Python ### - *.py[cod] +# virtualenv +.venv/ +venv/ + # C extensions *.so @@ -19,16 +21,17 @@ develop-eggs .installed.cfg lib lib64 -__pycache__ -cover -# Installer logs +# pip pip-log.txt +pip-wheel-metadata # Unit test / coverage reports .coverage .tox nosetests.xml +test-output/ +.pytest_cache # Translations *.mo @@ -38,22 +41,19 @@ nosetests.xml .project .pydevproject -*.bak -.bumpversion.cfg +# Complexity +output/*.html +output/*/index.html -# Docs +# Sphinx docs/_build +README.html -# Pylint -pylintrc - -### Extra models and data ### +# mypy -text/*.pickle -text/en/*.pickle +.mypy_cache -# Readme build -README.html +!tests/.env -.ipynb_checkpoints/ -*.ipynb +# ruff +.ruff_cache diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..3b6c0374 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.1 + hooks: + - id: ruff + - id: ruff-format +- repo: https://github.com/python-jsonschema/check-jsonschema + rev: 0.28.0 + hooks: + - id: check-github-workflows +- repo: https://github.com/asottile/blacken-docs + rev: 1.16.0 + hooks: + - id: blacken-docs + additional_dependencies: [black==23.12.1] diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000..4bab2023 --- /dev/null +++ 
b/.readthedocs.yml @@ -0,0 +1,15 @@ +version: 2 +sphinx: + configuration: docs/conf.py +formats: + - pdf +build: + os: ubuntu-22.04 + tools: + python: "3.11" +python: + install: + - method: pip + path: . + extra_requirements: + - docs diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 747238c7..00000000 --- a/.travis.yml +++ /dev/null @@ -1,40 +0,0 @@ -language: python -python: - - "2.7" - - "3.5" - - "3.6" - - "3.7" - - "3.8" -before_install: - - wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz - - tar -xzvf nltk_data-0.11.0.tar.gz -C ~ -install: - - pip install numpy - - pip install -r dev-requirements.txt - - pip install -U six - - pip install -U . - - if [[ $TRAVIS_PYTHON_VERSION == '3.8' ]]; then pip install -r docs/requirements.txt; - fi -script: - - python run_tests.py - - if [[ $TRAVIS_PYTHON_VERSION == '3.8' ]]; then cd docs && make doctest; fi - -jobs: - include: - - stage: PyPI Release - if: tag IS present - python: "3.6" - env: [] - # Override before_install, install, and script to no-ops - before_install: skip - install: skip - script: echo "Releasing to PyPI..." - after_success: skip - deploy: - provider: pypi - user: sloria - password: - secure: aPoSh6zkeB6PnS77fmoeT/PzB/oeE7aM0g9ZrPd19ZwC5aORtF7/ifDfzYwYWhdyua4fLAzaEu3Z+pk5z644r1Zq8Jxryv18LeFzkzO/Sk/O9LxpJQ+ypbTIIK9Oc5LdQ0qCd5L3RtMV3zIvocvnpryVmkAm/vYBm77rCBFcMxg= - on: - tags: true - distributions: sdist bdist_wheel diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 344b06c3..a6195d4e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,20 @@ Changelog ========= +0.18.0 (unreleased) +------------------- + +Removals: + +- ``TextBlob.translate()`` and ``TextBlob.detect_language``, and ``textblob.translate`` + are removed. Use the official Google Translate API instead (:issue:`215`). +- Remove ``textblob.compat``. + +Support: + +- Support Python 3.8-3.12. Older versions are no longer supported. +- Support nltk>=3.8. + 0.17.1 (2021-10-21) ------------------- diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index cf30d7fa..02cc8bd7 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -57,8 +57,10 @@ To create a new extension for a part-of-speech tagger, sentiment analyzer, noun from textblob.base import BaseTagger + class MyTagger(BaseTagger): def tag(self, text): + pass # Your implementation goes here Language Extensions @@ -102,7 +104,6 @@ Pull Requests - If the pull request adds functionality, it is tested and the docs are updated. - If you've developed an extension, it is on the :ref:`Extensions List `. -- The pull request works on Python 2.7, 3.4, 3.5, 3.6, and PyPy. Use ``tox`` to verify that it does. - You've added yourself to ``AUTHORS.rst``. 4. Submit a pull request to the ``sloria:dev`` branch. @@ -112,34 +113,20 @@ Running tests To run all the tests: :: - $ python run_tests.py + $ pytest To skip slow tests: :: - $ python run_tests.py fast - -To skip tests that require internet: :: - - $ python run_tests.py no-internet - -To get test coverage reports (must have coverage installed): :: - - $ python run_tests.py cover - -To run tests on Python 2.7, 3.4, 3.5, and 3.6 virtual environments (must have each interpreter installed): :: - - $ tox + $ pytest -m 'not slow' Documentation +++++++++++++ Contributions to the documentation are welcome. Documentation is written in `reStructuredText`_ (rST). A quick rST reference can be found `here `_. Builds are powered by Sphinx_. 
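The ``-m 'not slow'`` filter shown above relies on the ``slow`` marker registered under ``[tool.pytest.ini_options]`` in ``pyproject.toml`` (added later in this patch). A minimal, illustrative sketch of a test that opts into it; the test name and body are hypothetical:

.. code-block:: python

    import pytest


    @pytest.mark.slow
    def test_something_expensive():
        # Deselected when tests are run with -m 'not slow'.
        ...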
-To build docs: :: - - $ invoke docs -b +To build docs and run in watch mode: :: -The ``-b`` (for "browse") automatically opens up the docs in your browser after building. + $ tox -e watch-docs .. _Sphinx: http://sphinx.pocoo.org/ diff --git a/LICENSE b/LICENSE index 3851c2cc..b20df7ca 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2013-2021 Steven Loria +Copyright Steven Loria and contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 2f8dcd2c..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include *.rst LICENSE NOTICE -recursive-include textblob *.txt -recursive-include textblob *.xml diff --git a/README.rst b/README.rst index 635de50f..a4a07e21 100644 --- a/README.rst +++ b/README.rst @@ -6,20 +6,21 @@ TextBlob: Simplified Text Processing :target: https://pypi.org/project/textblob/ :alt: Latest version -.. image:: https://badgen.net/travis/sloria/TextBlob/dev - :target: https://travis-ci.org/sloria/TextBlob - :alt: Travis-CI +.. image:: https://github.com/sloria/TextBlob/actions/workflows/build-release.yml/badge.svg + :target: https://github.com/sloria/TextBlob/actions/workflows/build-release.yml + :alt: Build status + Homepage: `https://textblob.readthedocs.io/ `_ -`TextBlob` is a Python (2 and 3) library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. +`TextBlob` is a Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. .. code-block:: python from textblob import TextBlob - text = ''' + text = """ The titular threat of The Blob has always struck me as the ultimate movie monster: an insatiably hungry, amoeba-like mass able to penetrate virtually any safeguard, capable of--as a doomed doctor chillingly @@ -28,15 +29,15 @@ Homepage: `https://textblob.readthedocs.io/ `_ devastating of potential consequences, not unlike the grey goo scenario proposed by technological theorists fearful of artificial intelligence run rampant. - ''' + """ blob = TextBlob(text) - blob.tags # [('The', 'DT'), ('titular', 'JJ'), - # ('threat', 'NN'), ('of', 'IN'), ...] + blob.tags # [('The', 'DT'), ('titular', 'JJ'), + # ('threat', 'NN'), ('of', 'IN'), ...] - blob.noun_phrases # WordList(['titular threat', 'blob', - # 'ultimate movie monster', - # 'amoeba-like mass', ...]) + blob.noun_phrases # WordList(['titular threat', 'blob', + # 'ultimate movie monster', + # 'amoeba-like mass', ...]) for sentence in blob.sentences: print(sentence.sentiment.polarity) @@ -82,11 +83,6 @@ Documentation Full documentation is available at https://textblob.readthedocs.io/. 
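The classification support mentioned above is documented in ``docs/classifiers.rst`` (also updated in this patch). A condensed, illustrative sketch of that workflow, reusing a few of the training examples from those docs; the printed label depends on the training data, so it is only a likely outcome:

.. code-block:: python

    from textblob.classifiers import NaiveBayesClassifier

    train = [
        ("I love this sandwich.", "pos"),
        ("this is an amazing place!", "pos"),
        ("I do not like this restaurant", "neg"),
        ("my boss is horrible.", "neg"),
    ]
    cl = NaiveBayesClassifier(train)
    print(cl.classify("the beer was good."))  # likely "pos" with this training set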
-Requirements ------------- - -- Python >= 2.7 or >= 3.5 - Project Links ------------- diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index d03a398a..00000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -nose>=1.3.0 -tox>=2.6.0 -invoke>=0.15.0 -mock==3.0.5 -flake8==3.9.2 diff --git a/docs/_templates/side-primary.html b/docs/_templates/side-primary.html index 2842dc3f..ea9e6fb1 100644 --- a/docs/_templates/side-primary.html +++ b/docs/_templates/side-primary.html @@ -1,17 +1,32 @@

[hunk body: only the text nodes of this HTML template are recoverable; the markup itself was lost. The sidebar blurb ("TextBlob is a Python (2 and 3) library for processing textual data. It provides a consistent API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, and more.") is re-wrapped, followed by the "Useful Links" (TextBlob @ PyPI, ...), "Stay Informed", and "Donate" sections, the latter containing "If you find TextBlob useful, please consider supporting its author:", a "Flattr this" link, and "Your donation helps move TextBlob forward."]
diff --git a/docs/_templates/side-secondary.html b/docs/_templates/side-secondary.html
index 043fe351..037f2f12 100644
--- a/docs/_templates/side-secondary.html
+++ b/docs/_templates/side-secondary.html
@@ -1,18 +1,32 @@
[hunk body: only the text nodes are recoverable; the markup itself was lost. The sidebar blurb changes from "TextBlob is a Python (2 and 3) library for processing textual data. It provides a consistent API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, and more." to the same wording without "(2 and 3)", re-wrapped, followed by the "Useful Links" section (TextBlob @ PyPI, ...).]
    • diff --git a/docs/_themes/flask_theme_support.py b/docs/_themes/flask_theme_support.py index 33f47449..64e24996 100755 --- a/docs/_themes/flask_theme_support.py +++ b/docs/_themes/flask_theme_support.py @@ -1,7 +1,19 @@ # flasky extensions. flasky pygments style based on tango style from pygments.style import Style -from pygments.token import Keyword, Name, Comment, String, Error, \ - Number, Operator, Generic, Whitespace, Punctuation, Other, Literal +from pygments.token import ( + Comment, + Error, + Generic, + Keyword, + Literal, + Name, + Number, + Operator, + Other, + Punctuation, + String, + Whitespace, +) class FlaskyStyle(Style): @@ -10,77 +22,68 @@ class FlaskyStyle(Style): styles = { # No corresponding class for the following: - #Text: "", # class: '' - Whitespace: "underline #f8f8f8", # class: 'w' - Error: "#a40000 border:#ef2929", # class: 'err' - Other: "#000000", # class 'x' - - Comment: "italic #8f5902", # class: 'c' - Comment.Preproc: "noitalic", # class: 'cp' - - Keyword: "bold #004461", # class: 'k' - Keyword.Constant: "bold #004461", # class: 'kc' - Keyword.Declaration: "bold #004461", # class: 'kd' - Keyword.Namespace: "bold #004461", # class: 'kn' - Keyword.Pseudo: "bold #004461", # class: 'kp' - Keyword.Reserved: "bold #004461", # class: 'kr' - Keyword.Type: "bold #004461", # class: 'kt' - - Operator: "#582800", # class: 'o' - Operator.Word: "bold #004461", # class: 'ow' - like keywords - - Punctuation: "bold #000000", # class: 'p' - + # Text: "", # class: '' + Whitespace: "underline #f8f8f8", # class: 'w' + Error: "#a40000 border:#ef2929", # class: 'err' + Other: "#000000", # class 'x' + Comment: "italic #8f5902", # class: 'c' + Comment.Preproc: "noitalic", # class: 'cp' + Keyword: "bold #004461", # class: 'k' + Keyword.Constant: "bold #004461", # class: 'kc' + Keyword.Declaration: "bold #004461", # class: 'kd' + Keyword.Namespace: "bold #004461", # class: 'kn' + Keyword.Pseudo: "bold #004461", # class: 'kp' + Keyword.Reserved: "bold #004461", # class: 'kr' + Keyword.Type: "bold #004461", # class: 'kt' + Operator: "#582800", # class: 'o' + Operator.Word: "bold #004461", # class: 'ow' - like keywords + Punctuation: "bold #000000", # class: 'p' # because special names such as Name.Class, Name.Function, etc. # are not recognized as such later in the parsing, we choose them # to look the same as ordinary variables. 
- Name: "#000000", # class: 'n' - Name.Attribute: "#c4a000", # class: 'na' - to be revised - Name.Builtin: "#004461", # class: 'nb' - Name.Builtin.Pseudo: "#3465a4", # class: 'bp' - Name.Class: "#000000", # class: 'nc' - to be revised - Name.Constant: "#000000", # class: 'no' - to be revised - Name.Decorator: "#888", # class: 'nd' - to be revised - Name.Entity: "#ce5c00", # class: 'ni' - Name.Exception: "bold #cc0000", # class: 'ne' - Name.Function: "#000000", # class: 'nf' - Name.Property: "#000000", # class: 'py' - Name.Label: "#f57900", # class: 'nl' - Name.Namespace: "#000000", # class: 'nn' - to be revised - Name.Other: "#000000", # class: 'nx' - Name.Tag: "bold #004461", # class: 'nt' - like a keyword - Name.Variable: "#000000", # class: 'nv' - to be revised - Name.Variable.Class: "#000000", # class: 'vc' - to be revised - Name.Variable.Global: "#000000", # class: 'vg' - to be revised - Name.Variable.Instance: "#000000", # class: 'vi' - to be revised - - Number: "#990000", # class: 'm' - - Literal: "#000000", # class: 'l' - Literal.Date: "#000000", # class: 'ld' - - String: "#4e9a06", # class: 's' - String.Backtick: "#4e9a06", # class: 'sb' - String.Char: "#4e9a06", # class: 'sc' - String.Doc: "italic #8f5902", # class: 'sd' - like a comment - String.Double: "#4e9a06", # class: 's2' - String.Escape: "#4e9a06", # class: 'se' - String.Heredoc: "#4e9a06", # class: 'sh' - String.Interpol: "#4e9a06", # class: 'si' - String.Other: "#4e9a06", # class: 'sx' - String.Regex: "#4e9a06", # class: 'sr' - String.Single: "#4e9a06", # class: 's1' - String.Symbol: "#4e9a06", # class: 'ss' - - Generic: "#000000", # class: 'g' - Generic.Deleted: "#a40000", # class: 'gd' - Generic.Emph: "italic #000000", # class: 'ge' - Generic.Error: "#ef2929", # class: 'gr' - Generic.Heading: "bold #000080", # class: 'gh' - Generic.Inserted: "#00A000", # class: 'gi' - Generic.Output: "#888", # class: 'go' - Generic.Prompt: "#745334", # class: 'gp' - Generic.Strong: "bold #000000", # class: 'gs' - Generic.Subheading: "bold #800080", # class: 'gu' - Generic.Traceback: "bold #a40000", # class: 'gt' + Name: "#000000", # class: 'n' + Name.Attribute: "#c4a000", # class: 'na' - to be revised + Name.Builtin: "#004461", # class: 'nb' + Name.Builtin.Pseudo: "#3465a4", # class: 'bp' + Name.Class: "#000000", # class: 'nc' - to be revised + Name.Constant: "#000000", # class: 'no' - to be revised + Name.Decorator: "#888", # class: 'nd' - to be revised + Name.Entity: "#ce5c00", # class: 'ni' + Name.Exception: "bold #cc0000", # class: 'ne' + Name.Function: "#000000", # class: 'nf' + Name.Property: "#000000", # class: 'py' + Name.Label: "#f57900", # class: 'nl' + Name.Namespace: "#000000", # class: 'nn' - to be revised + Name.Other: "#000000", # class: 'nx' + Name.Tag: "bold #004461", # class: 'nt' - like a keyword + Name.Variable: "#000000", # class: 'nv' - to be revised + Name.Variable.Class: "#000000", # class: 'vc' - to be revised + Name.Variable.Global: "#000000", # class: 'vg' - to be revised + Name.Variable.Instance: "#000000", # class: 'vi' - to be revised + Number: "#990000", # class: 'm' + Literal: "#000000", # class: 'l' + Literal.Date: "#000000", # class: 'ld' + String: "#4e9a06", # class: 's' + String.Backtick: "#4e9a06", # class: 'sb' + String.Char: "#4e9a06", # class: 'sc' + String.Doc: "italic #8f5902", # class: 'sd' - like a comment + String.Double: "#4e9a06", # class: 's2' + String.Escape: "#4e9a06", # class: 'se' + String.Heredoc: "#4e9a06", # class: 'sh' + String.Interpol: "#4e9a06", # class: 'si' + String.Other: 
"#4e9a06", # class: 'sx' + String.Regex: "#4e9a06", # class: 'sr' + String.Single: "#4e9a06", # class: 's1' + String.Symbol: "#4e9a06", # class: 'ss' + Generic: "#000000", # class: 'g' + Generic.Deleted: "#a40000", # class: 'gd' + Generic.Emph: "italic #000000", # class: 'ge' + Generic.Error: "#ef2929", # class: 'gr' + Generic.Heading: "bold #000080", # class: 'gh' + Generic.Inserted: "#00A000", # class: 'gi' + Generic.Output: "#888", # class: 'go' + Generic.Prompt: "#745334", # class: 'gp' + Generic.Strong: "bold #000000", # class: 'gs' + Generic.Subheading: "bold #800080", # class: 'gu' + Generic.Traceback: "bold #a40000", # class: 'gt' } diff --git a/docs/_themes/kr/layout.html b/docs/_themes/kr/layout.html index 8ab173df..1b7a4f9a 100755 --- a/docs/_themes/kr/layout.html +++ b/docs/_themes/kr/layout.html @@ -1,18 +1,23 @@ -{%- extends "basic/layout.html" %} -{%- block extrahead %} - {{ super() }} - {% if theme_touch_icon %} - - {% endif %} - -{% endblock %} -{%- block relbar2 %}{% endblock %} -{%- block footer %} - - - Fork me on GitHub - +{%- extends "basic/layout.html" %} {%- block extrahead %} {{ super() }} {% if +theme_touch_icon %} + +{% endif %} + +{% endblock %} {%- block relbar2 %}{% endblock %} {%- block footer %} + + + Fork me on GitHub + -{%- endblock %} \ No newline at end of file +{%- endblock %} diff --git a/docs/_themes/kr_small/layout.html b/docs/_themes/kr_small/layout.html index aa1716aa..b60234dd 100755 --- a/docs/_themes/kr_small/layout.html +++ b/docs/_themes/kr_small/layout.html @@ -1,22 +1,15 @@ -{% extends "basic/layout.html" %} -{% block header %} - {{ super() }} - {% if pagename == 'index' %} -
      - {% endif %} -{% endblock %} -{% block footer %} - {% if pagename == 'index' %} -
      - {% endif %} -{% endblock %} -{# do not display relbars #} -{% block relbar1 %}{% endblock %} -{% block relbar2 %} - {% if theme_github_fork %} - Fork me on GitHub - {% endif %} -{% endblock %} -{% block sidebar1 %}{% endblock %} -{% block sidebar2 %}{% endblock %} +{% extends "basic/layout.html" %} {% block header %} {{ super() }} {% if +pagename == 'index' %} +
      + {% endif %} {% endblock %} {% block footer %} {% if pagename == 'index' %} +
      +{% endif %} {% endblock %} {# do not display relbars #} {% block relbar1 %}{% +endblock %} {% block relbar2 %} {% if theme_github_fork %} +Fork me on GitHub +{% endif %} {% endblock %} {% block sidebar1 %}{% endblock %} {% block sidebar2 +%}{% endblock %} diff --git a/docs/classifiers.rst b/docs/classifiers.rst index 78be38ca..93d7f7bf 100644 --- a/docs/classifiers.rst +++ b/docs/classifiers.rst @@ -16,24 +16,24 @@ First we'll create some training and test data. .. doctest:: >>> train = [ - ... ('I love this sandwich.', 'pos'), - ... ('this is an amazing place!', 'pos'), - ... ('I feel very good about these beers.', 'pos'), - ... ('this is my best work.', 'pos'), - ... ("what an awesome view", 'pos'), - ... ('I do not like this restaurant', 'neg'), - ... ('I am tired of this stuff.', 'neg'), - ... ("I can't deal with this", 'neg'), - ... ('he is my sworn enemy!', 'neg'), - ... ('my boss is horrible.', 'neg') + ... ("I love this sandwich.", "pos"), + ... ("this is an amazing place!", "pos"), + ... ("I feel very good about these beers.", "pos"), + ... ("this is my best work.", "pos"), + ... ("what an awesome view", "pos"), + ... ("I do not like this restaurant", "neg"), + ... ("I am tired of this stuff.", "neg"), + ... ("I can't deal with this", "neg"), + ... ("he is my sworn enemy!", "neg"), + ... ("my boss is horrible.", "neg"), ... ] >>> test = [ - ... ('the beer was good.', 'pos'), - ... ('I do not enjoy my job', 'neg'), - ... ("I ain't feeling dandy today.", 'neg'), - ... ("I feel amazing!", 'pos'), - ... ('Gary is a friend of mine.', 'pos'), - ... ("I can't believe I'm doing this.", 'neg') + ... ("the beer was good.", "pos"), + ... ("I do not enjoy my job", "neg"), + ... ("I ain't feeling dandy today.", "neg"), + ... ("I feel amazing!", "pos"), + ... ("Gary is a friend of mine.", "pos"), + ... ("I can't believe I'm doing this.", "neg"), ... ] Now we'll create a Naive Bayes classifier, passing the training data into the constructor. @@ -154,10 +154,12 @@ Use the ``update(new_data)`` method to update a classifier with new training dat .. doctest:: - >>> new_data = [('She is my best friend.', 'pos'), - ... ("I'm happy to have a new friend.", 'pos'), - ... ("Stay thirsty, my friend.", 'pos'), - ... ("He ain't from around here.", 'neg')] + >>> new_data = [ + ... ("She is my best friend.", "pos"), + ... ("I'm happy to have a new friend.", "pos"), + ... ("Stay thirsty, my friend.", "pos"), + ... ("He ain't from around here.", "neg"), + ... ] >>> cl.update(new_data) True >>> cl.accuracy(test) @@ -185,8 +187,9 @@ For example, let's create a feature extractor that just uses the first and last ... feats["first({0})".format(first_word)] = True ... feats["last({0})".format(last_word)] = False ... return feats + ... >>> features = end_word_extractor("I feel happy") - >>> assert features == {'last(happy)': False, 'first(I)': True} + >>> assert features == {"last(happy)": False, "first(I)": True} We can then use the feature extractor in a classifier by passing it as the second argument of the constructor. diff --git a/docs/conf.py b/docs/conf.py index ccbf4404..76b5dd3a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,13 +1,7 @@ -# -*- coding: utf-8 -*- -import datetime as dt +import importlib.metadata import os import sys -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-sys.path.insert(0, os.path.abspath('..')) -import textblob sys.path.append(os.path.abspath("_themes")) # -- General configuration ----------------------------------------------------- @@ -15,53 +9,50 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.viewcode', - 'sphinx_issues', + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.viewcode", + "sphinx_issues", ] -primary_domain = 'py' -default_role = 'py:obj' +primary_domain = "py" +default_role = "py:obj" -issues_github_path = 'sloria/TextBlob' +issues_github_path = "sloria/TextBlob" # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'TextBlob' -copyright = u'{0:%Y} Steven Loria'.format( - dt.datetime.utcnow() -) +project = "TextBlob" +copyright = 'Steven Loria and contributors' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = release = textblob.__version__ -exclude_patterns = ['_build'] -pygments_style = 'flask_theme_support.FlaskyStyle' -html_theme = 'kr' -html_theme_path = ['_themes'] +version = release = importlib.metadata.version("textblob") +exclude_patterns = ["_build"] +pygments_style = "flask_theme_support.FlaskyStyle" +html_theme = "kr" +html_theme_path = ["_themes"] -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom sidebar templates, maps document names to template names. html_sidebars = { - 'index': ['side-primary.html', 'searchbox.html'], - '**': ['side-secondary.html', 'localtoc.html', - 'relations.html', 'searchbox.html'] + "index": ["side-primary.html", "searchbox.html"], + "**": ["side-secondary.html", "localtoc.html", "relations.html", "searchbox.html"], } # Output file base name for HTML help builder. -htmlhelp_basename = 'textblobdoc' +htmlhelp_basename = "textblobdoc" # -- Options for LaTeX output -------------------------------------------------- @@ -69,23 +60,25 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'TextBlob.tex', u'textblob Documentation', - u'Steven Loria', 'manual'), + ("index", "TextBlob.tex", "textblob Documentation", "Steven Loria", "manual"), ] # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'textblob', u'textblob Documentation', - [u'Steven Loria'], 1) -] +man_pages = [("index", "textblob", "textblob Documentation", ["Steven Loria"], 1)] # -- Options for Texinfo output ------------------------------------------------ # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'textblob', u'TextBlob Documentation', - u'Steven Loria', 'textblob', 'Simplified Python text-processing.', - 'Natural Language Processing'), + ( + "index", + "textblob", + "TextBlob Documentation", + "Steven Loria", + "textblob", + "Simplified Python text-processing.", + "Natural Language Processing", + ), ] diff --git a/docs/index.rst b/docs/index.rst index 6c5f0ecc..b4c64479 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,14 +8,14 @@ TextBlob: Simplified Text Processing Release v\ |version|. (:ref:`Changelog`) -*TextBlob* is a Python (2 and 3) library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. +*TextBlob* is a Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. .. code-block:: python from textblob import TextBlob - text = ''' + text = """ The titular threat of The Blob has always struck me as the ultimate movie monster: an insatiably hungry, amoeba-like mass able to penetrate virtually any safeguard, capable of--as a doomed doctor chillingly @@ -24,15 +24,15 @@ Release v\ |version|. (:ref:`Changelog`) devastating of potential consequences, not unlike the grey goo scenario proposed by technological theorists fearful of artificial intelligence run rampant. - ''' + """ blob = TextBlob(text) - blob.tags # [('The', 'DT'), ('titular', 'JJ'), - # ('threat', 'NN'), ('of', 'IN'), ...] + blob.tags # [('The', 'DT'), ('titular', 'JJ'), + # ('threat', 'NN'), ('of', 'IN'), ...] - blob.noun_phrases # WordList(['titular threat', 'blob', - # 'ultimate movie monster', - # 'amoeba-like mass', ...]) + blob.noun_phrases # WordList(['titular threat', 'blob', + # 'ultimate movie monster', + # 'amoeba-like mass', ...]) for sentence in blob.sentences: print(sentence.sentiment.polarity) diff --git a/docs/install.rst b/docs/install.rst index 2ec9f5da..eeb3baf9 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -79,16 +79,10 @@ Old: from text.taggers import NLTKTagger -Python -++++++ - -TextBlob supports Python >=2.7 or >=3.5. - - Dependencies ++++++++++++ -TextBlob depends on NLTK 3. NLTK will be installed automatically when you run ``pip install textblob`` or ``python setup.py install``. +TextBlob depends on NLTK 3. NLTK will be installed automatically when you run ``pip install textblob``. Some features, such as the maximum entropy classifier, require `numpy`_, but it is not required for basic usage. diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 5cf34e20..db820488 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -63,9 +63,11 @@ You can break TextBlobs into words or sentences. .. doctest:: - >>> zen = TextBlob("Beautiful is better than ugly. " - ... "Explicit is better than implicit. " - ... "Simple is better than complex.") + >>> zen = TextBlob( + ... "Beautiful is better than ugly. " + ... "Explicit is better than implicit. " + ... "Simple is better than complex." + ... 
) >>> zen.words WordList(['Beautiful', 'is', 'better', 'than', 'ugly', 'Explicit', 'is', 'better', 'than', 'implicit', 'Simple', 'is', 'better', 'than', 'complex']) >>> zen.sentences @@ -89,7 +91,7 @@ object (a subclass of ``unicode``) with useful methods, e.g. for word inflection .. doctest:: - >>> sentence = TextBlob('Use 4 spaces per indentation level.') + >>> sentence = TextBlob("Use 4 spaces per indentation level.") >>> sentence.words WordList(['Use', '4', 'spaces', 'per', 'indentation', 'level']) >>> sentence.words[2].singularize() @@ -136,8 +138,8 @@ You can also create synsets directly. .. doctest:: >>> from textblob.wordnet import Synset - >>> octopus = Synset('octopus.n.02') - >>> shrimp = Synset('shrimp.n.03') + >>> octopus = Synset("octopus.n.02") + >>> shrimp = Synset("shrimp.n.03") >>> octopus.path_similarity(shrimp) 0.1111111111111111 @@ -172,7 +174,7 @@ Use the :meth:`correct() ` method to attempt spelling correcti .. doctest:: >>> from textblob import Word - >>> w = Word('falibility') + >>> w = Word("falibility") >>> w.spellcheck() [('fallibility', 1.0)] @@ -245,18 +247,18 @@ You can make comparisons between TextBlobs and strings. .. doctest:: - >>> apple_blob = TextBlob('apples') - >>> banana_blob = TextBlob('bananas') + >>> apple_blob = TextBlob("apples") + >>> banana_blob = TextBlob("bananas") >>> apple_blob < banana_blob True - >>> apple_blob == 'apples' + >>> apple_blob == "apples" True You can concatenate and interpolate TextBlobs and strings. .. doctest:: - >>> apple_blob + ' and ' + banana_blob + >>> apple_blob + " and " + banana_blob TextBlob("apples and bananas") >>> "{0} and {1}".format(apple_blob, banana_blob) 'apples and bananas' @@ -283,6 +285,7 @@ Use ``sentence.start`` and ``sentence.end`` to get the indices where a sentence >>> for s in zen.sentences: ... print(s) ... print("---- Starts at index {}, Ends at index {}".format(s.start, s.end)) + ... Beautiful is better than ugly. ---- Starts at index 0, Ends at index 30 Explicit is better than implicit. diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index a27149c2..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ - -sphinx==3.5.4 -PyYAML==3.13 -sphinx-issues==1.2.0 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..828e862d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,100 @@ +[project] +name = "TextBlob" +version = "0.17.1" +description = "Simple, Pythonic text processing. Sentiment analysis, part-of-speech tagging, noun phrase parsing, and more." 
+readme = "README.rst" +license = { file = "LICENSE" } +authors = [{ name = "Steven Loria", email = "sloria1@gmail.com" }] +classifiers = [ + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Text Processing :: Linguistic", +] +keywords = ["textblob", "nlp", 'linguistics', 'nltk', 'pattern'] +requires-python = ">=3.8" +dependencies = ["nltk>=3.8"] + +[project.urls] +Changelog = "https://textblob.readthedocs.io/en/latest/changelog.html" +Issues = "https://github.com/sloria/TextBlob/issues" +Source = "https://github.com/sloria/TextBlob" + +[project.optional-dependencies] +docs = ["sphinx==7.2.6", "sphinx-issues==4.0.0", "PyYAML==6.0.1"] +tests = ["pytest", "numpy"] +dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] + +[build-system] +requires = ["flit_core<4"] +build-backend = "flit_core.buildapi" + +[tool.flit.module] +# Needed because import name is `textblob` and package name is `TextBlob` +name = "textblob" + +[tool.flit.sdist] +include = ["tests/", "CHANGELOG.rst", "CONTRIBUTING.rst", "tox.ini"] + +[tool.ruff] +src = ["src"] +fix = true +show-fixes = true +unsafe-fixes = true +exclude = [ + # Default excludes from ruff + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", + # Vendorized code + "src/textblob/en", + "src/textblob/unicodecsv", + "src/textblob/_text.py", +] + +[tool.ruff.lint] +select = [ + "B", # flake8-bugbear + "E", # pycodestyle error + "F", # pyflakes + "I", # isort + "UP", # pyupgrade + "W", # pycodestyle warning +] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["E721"] + +[tool.pytest.ini_options] +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "numpy: marks tests that require numpy", +] diff --git a/run_tests.py b/run_tests.py deleted file mode 100644 index accf7fdc..00000000 --- a/run_tests.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -''' -The main test runner script. 
- -Usage: :: - python run_tests.py -Skip slow tests - python run_tests.py fast -When there's no Internet - python run_tests.py no-internet -''' -from __future__ import unicode_literals -import subprocess -import sys - -import nose - -from textblob.compat import PY2 - -PY26 = PY2 and int(sys.version_info[1]) < 7 -PYPY = "PyPy" in sys.version - - -def main(): - args = get_argv() - retcode = subprocess.call(['flake8', 'textblob']) - if retcode: - sys.exit(1) - success = nose.run(argv=args) - sys.exit(0) if success else sys.exit(1) - - -def get_argv(): - args = [sys.argv[0], "tests", '--verbosity', '2'] - attr_conditions = [] # Use nose's attribselect plugin to filter tests - if "force-all" in sys.argv: - # Don't exclude any tests - return args - if "cover" in sys.argv: - args += ["--with-coverage", "--cover-html"] - try: - __import__('numpy') - except ImportError: - # Exclude tests that require numpy - attr_conditions.append("not requires_numpy") - if not PY2: - # Exclude tests that only work on python2 - attr_conditions.append("not py2_only") - if PYPY: - # Exclude tests that don't work on PyPY - attr_conditions.append("not no_pypy") - if "fast" in sys.argv: - attr_conditions.append("not slow") - if "no-internet" in sys.argv: - # Exclude tests that require internet - attr_conditions.append("not requires_internet") - - # Skip tests with the "skip" attribute - attr_conditions.append("not skip") - - attr_expression = " and ".join(attr_conditions) - if attr_expression: - args.extend(["-A", attr_expression]) - return args - - -if __name__ == '__main__': - main() diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 90777339..00000000 --- a/setup.cfg +++ /dev/null @@ -1,20 +0,0 @@ -[bdist_wheel] -universal = 1 - -[flake8] -ignore = E501,E127,E128,E265,E302,E266 -max-line-length = 90 -exclude = - .git, - .ropeproject, - .tox, - docs, - .git, - build, - env, - venv, - # Exclude vendorized code - textblob/en, - textblob/unicodecsv, - textblob/_text.py, - textblob/compat.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 707eef13..00000000 --- a/setup.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import re -from setuptools import setup, find_packages - -REQUIREMENTS = [ - 'nltk>=3.1; python_version >= "3"', - 'nltk>=3.1,<3.5; python_version < "3"', -] -def find_version(fname): - """Attempts to find the version number in the file names fname. - Raises RuntimeError if not found. - """ - version = '' - with open(fname, 'r') as fp: - reg = re.compile(r'__version__ = [\'"]([^\'"]*)[\'"]') - for line in fp: - m = reg.match(line) - if m: - version = m.group(1) - break - if not version: - raise RuntimeError('Cannot find version information') - return version - - -__version__ = find_version('textblob/__init__.py') - - -def read(fname): - with open(fname) as fp: - content = fp.read() - return content - - -setup( - name='textblob', - version=__version__, - description='Simple, Pythonic text processing. 
Sentiment analysis, ' - 'part-of-speech tagging, noun phrase parsing, and more.', - long_description=read("README.rst"), - license='MIT', - author='Steven Loria', - author_email='sloria1@gmail.com', - url='https://github.com/sloria/TextBlob', - install_requires=REQUIREMENTS, - packages=find_packages(exclude=('test*', )), - include_package_data=True, - zip_safe=False, - package_data={ - "textblob.en": ["*.txt", "*.xml"] - }, - classifiers=( - 'Intended Audience :: Developers', - 'Natural Language :: English', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: Implementation :: CPython', - 'Programming Language :: Python :: Implementation :: PyPy', - "Topic :: Text Processing :: Linguistic", - ), - keywords=["textblob", "nlp", 'linguistics', 'nltk', 'pattern'] -) diff --git a/src/textblob/__init__.py b/src/textblob/__init__.py new file mode 100644 index 00000000..7589f6d0 --- /dev/null +++ b/src/textblob/__init__.py @@ -0,0 +1,9 @@ +from .blob import Blobber, Sentence, TextBlob, Word, WordList + +__all__ = [ + "TextBlob", + "Word", + "Sentence", + "Blobber", + "WordList", +] diff --git a/textblob/_text.py b/src/textblob/_text.py similarity index 54% rename from textblob/_text.py rename to src/textblob/_text.py index 152d0e12..d247c397 100644 --- a/textblob/_text.py +++ b/src/textblob/_text.py @@ -1,51 +1,55 @@ -# -*- coding: utf-8 -*- """This file is adapted from the pattern library. URL: http://www.clips.ua.ac.be/pages/pattern-web Licence: BSD """ -from __future__ import unicode_literals -import string import codecs -from itertools import chain -import types import os import re +import string +import types +from itertools import chain from xml.etree import cElementTree -from .compat import text_type, basestring, imap, unicode, binary_type, PY2 +basestring = (str, bytes) try: MODULE = os.path.dirname(os.path.abspath(__file__)) except: MODULE = "" -SLASH, WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA = \ - "&slash;", "word", "part-of-speech", "chunk", "preposition", "relation", "anchor", "lemma" +SLASH, WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA = ( + "&slash;", + "word", + "part-of-speech", + "chunk", + "preposition", + "relation", + "anchor", + "lemma", +) # String functions def decode_string(v, encoding="utf-8"): - """ Returns the given value as a Unicode string (if possible). - """ + """Returns the given value as a Unicode string (if possible).""" if isinstance(encoding, basestring): encoding = ((encoding,),) + (("windows-1252",), ("utf-8", "ignore")) - if isinstance(v, binary_type): + if isinstance(v, bytes): for e in encoding: try: return v.decode(*e) except: pass return v - return unicode(v) + return str(v) def encode_string(v, encoding="utf-8"): - """ Returns the given value as a Python byte string (if possible). 
- """ + """Returns the given value as a Python byte string (if possible).""" if isinstance(encoding, basestring): encoding = ((encoding,),) + (("windows-1252",), ("utf-8", "ignore")) - if isinstance(v, unicode): + if isinstance(v, str): for e in encoding: try: return v.encode(*e) @@ -54,6 +58,7 @@ def encode_string(v, encoding="utf-8"): return v return str(v) + decode_utf8 = decode_string encode_utf8 = encode_string @@ -65,21 +70,21 @@ def isnumeric(strg): return False return True -#--- LAZY DICTIONARY ------------------------------------------------------------------------------- + +# --- LAZY DICTIONARY ------------------------------------------------------------------------------- # A lazy dictionary is empty until one of its methods is called. # This way many instances (e.g., lexicons) can be created without using memory until used. class lazydict(dict): - def load(self): # Must be overridden in a subclass. # Must load data with dict.__setitem__(self, k, v) instead of lazydict[k] = v. pass def _lazy(self, method, *args): - """ If the dictionary is empty, calls lazydict.load(). - Replaces lazydict.method() with dict.method() and calls it. + """If the dictionary is empty, calls lazydict.load(). + Replaces lazydict.method() with dict.method() and calls it. """ if dict.__len__(self) == 0: self.load() @@ -88,43 +93,56 @@ def _lazy(self, method, *args): def __repr__(self): return self._lazy("__repr__") + def __len__(self): return self._lazy("__len__") + def __iter__(self): return self._lazy("__iter__") + def __contains__(self, *args): return self._lazy("__contains__", *args) + def __getitem__(self, *args): return self._lazy("__getitem__", *args) + def __setitem__(self, *args): return self._lazy("__setitem__", *args) + def setdefault(self, *args): return self._lazy("setdefault", *args) + def get(self, *args, **kwargs): return self._lazy("get", *args) + def items(self): return self._lazy("items") + def keys(self): return self._lazy("keys") + def values(self): return self._lazy("values") + def update(self, *args): return self._lazy("update", *args) + def pop(self, *args): return self._lazy("pop", *args) + def popitem(self, *args): return self._lazy("popitem", *args) -class lazylist(list): +class lazylist(list): def load(self): # Must be overridden in a subclass. # Must load data with list.append(self, v) instead of lazylist.append(v). pass def _lazy(self, method, *args): - """ If the list is empty, calls lazylist.load(). - Replaces lazylist.method() with list.method() and calls it. + """If the list is empty, calls lazylist.load(). + Replaces lazylist.method() with list.method() and calls it. """ if list.__len__(self) == 0: self.load() @@ -133,24 +151,33 @@ def _lazy(self, method, *args): def __repr__(self): return self._lazy("__repr__") + def __len__(self): return self._lazy("__len__") + def __iter__(self): return self._lazy("__iter__") + def __contains__(self, *args): return self._lazy("__contains__", *args) + def insert(self, *args): return self._lazy("insert", *args) + def append(self, *args): return self._lazy("append", *args) + def extend(self, *args): return self._lazy("extend", *args) + def remove(self, *args): return self._lazy("remove", *args) + def pop(self, *args): return self._lazy("pop", *args) -#--- UNIVERSAL TAGSET ------------------------------------------------------------------------------ + +# --- UNIVERSAL TAGSET ------------------------------------------------------------------------------ # The default part-of-speech tagset used in Pattern is Penn Treebank II. 
# However, not all languages are well-suited to Penn Treebank (which was developed for English). # As more languages are implemented, this is becoming more problematic. @@ -165,14 +192,28 @@ def pop(self, *args): UNIVERSAL = "universal" -NOUN, VERB, ADJ, ADV, PRON, DET, PREP, ADP, NUM, CONJ, INTJ, PRT, PUNC, X = \ - "NN", "VB", "JJ", "RB", "PR", "DT", "PP", "PP", "NO", "CJ", "UH", "PT", ".", "X" +NOUN, VERB, ADJ, ADV, PRON, DET, PREP, ADP, NUM, CONJ, INTJ, PRT, PUNC, X = ( + "NN", + "VB", + "JJ", + "RB", + "PR", + "DT", + "PP", + "PP", + "NO", + "CJ", + "UH", + "PT", + ".", + "X", +) + def penntreebank2universal(token, tag): - """ Returns a (token, tag)-tuple with a simplified universal part-of-speech tag. - """ + """Returns a (token, tag)-tuple with a simplified universal part-of-speech tag.""" if tag.startswith(("NNP-", "NNPS-")): - return (token, "%s-%s" % (NOUN, tag.split("-")[-1])) + return (token, "{}-{}".format(NOUN, tag.split("-")[-1])) if tag in ("NN", "NNS", "NNP", "NNPS", "NP"): return (token, NOUN) if tag in ("MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"): @@ -195,45 +236,97 @@ def penntreebank2universal(token, tag): return (token, INTJ) if tag in ("POS", "RP", "TO"): return (token, PRT) - if tag in ("SYM", "LS", ".", "!", "?", ",", ":", "(", ")", "\"", "#", "$"): + if tag in ("SYM", "LS", ".", "!", "?", ",", ":", "(", ")", '"', "#", "$"): return (token, PUNC) return (token, X) -#--- TOKENIZER ------------------------------------------------------------------------------------- + +# --- TOKENIZER ------------------------------------------------------------------------------------- TOKEN = re.compile(r"(\S+)\s") # Handle common punctuation marks. -PUNCTUATION = \ -punctuation = ".,;:!?()[]{}`''\"@#$^&*+-|=~_" +PUNCTUATION = punctuation = ".,;:!?()[]{}`''\"@#$^&*+-|=~_" # Handle common abbreviations. -ABBREVIATIONS = abbreviations = set(( - "a.", "adj.", "adv.", "al.", "a.m.", "c.", "cf.", "comp.", "conf.", "def.", - "ed.", "e.g.", "esp.", "etc.", "ex.", "f.", "fig.", "gen.", "id.", "i.e.", - "int.", "l.", "m.", "Med.", "Mil.", "Mr.", "n.", "n.q.", "orig.", "pl.", - "pred.", "pres.", "p.m.", "ref.", "v.", "vs.", "w/" -)) - -RE_ABBR1 = re.compile("^[A-Za-z]\.$") # single letter, "T. De Smedt" -RE_ABBR2 = re.compile("^([A-Za-z]\.)+$") # alternating letters, "U.S." -RE_ABBR3 = re.compile("^[A-Z][" + "|".join( # capital followed by consonants, "Mr." - "bcdfghjklmnpqrstvwxz") + "]+.$") +ABBREVIATIONS = abbreviations = set( + ( + "a.", + "adj.", + "adv.", + "al.", + "a.m.", + "c.", + "cf.", + "comp.", + "conf.", + "def.", + "ed.", + "e.g.", + "esp.", + "etc.", + "ex.", + "f.", + "fig.", + "gen.", + "id.", + "i.e.", + "int.", + "l.", + "m.", + "Med.", + "Mil.", + "Mr.", + "n.", + "n.q.", + "orig.", + "pl.", + "pred.", + "pres.", + "p.m.", + "ref.", + "v.", + "vs.", + "w/", + ) +) + +RE_ABBR1 = re.compile(r"^[A-Za-z]\.$") # single letter, "T. De Smedt" +RE_ABBR2 = re.compile(r"^([A-Za-z]\.)+$") # alternating letters, "U.S." +RE_ABBR3 = re.compile( + "^[A-Z][" + + "|".join( # capital followed by consonants, "Mr." + "bcdfghjklmnpqrstvwxz" + ) + + "]+.$" +) # Handle emoticons. 
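# Illustrative only (not part of the library): a doctest-style sketch of the
# Penn Treebank -> universal mapping defined above.
# >>> penntreebank2universal("dogs", "NNS")
# ('dogs', 'NN')
# >>> penntreebank2universal("quickly", "RB")
# ('quickly', 'RB')
# >>> penntreebank2universal("(", "(")
# ('(', '.')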
-EMOTICONS = { # (facial expression, sentiment)-keys - ("love" , +1.00): set(("<3", "♥")), - ("grin" , +1.00): set((">:D", ":-D", ":D", "=-D", "=D", "X-D", "x-D", "XD", "xD", "8-D")), - ("taunt", +0.75): set((">:P", ":-P", ":P", ":-p", ":p", ":-b", ":b", ":c)", ":o)", ":^)")), - ("smile", +0.50): set((">:)", ":-)", ":)", "=)", "=]", ":]", ":}", ":>", ":3", "8)", "8-)")), - ("wink" , +0.25): set((">;]", ";-)", ";)", ";-]", ";]", ";D", ";^)", "*-)", "*)")), - ("gasp" , +0.05): set((">:o", ":-O", ":O", ":o", ":-o", "o_O", "o.O", "°O°", "°o°")), - ("worry", -0.25): set((">:/", ":-/", ":/", ":\\", ">:\\", ":-.", ":-s", ":s", ":S", ":-S", ">.>")), - ("frown", -0.75): set((">:[", ":-(", ":(", "=(", ":-[", ":[", ":{", ":-<", ":c", ":-c", "=/")), - ("cry" , -1.00): set((":'(", ":'''(", ";'(")) +EMOTICONS = { # (facial expression, sentiment)-keys + ("love", +1.00): set(("<3", "♥")), + ("grin", +1.00): set( + (">:D", ":-D", ":D", "=-D", "=D", "X-D", "x-D", "XD", "xD", "8-D") + ), + ("taunt", +0.75): set( + (">:P", ":-P", ":P", ":-p", ":p", ":-b", ":b", ":c)", ":o)", ":^)") + ), + ("smile", +0.50): set( + (">:)", ":-)", ":)", "=)", "=]", ":]", ":}", ":>", ":3", "8)", "8-)") + ), + ("wink", +0.25): set((">;]", ";-)", ";)", ";-]", ";]", ";D", ";^)", "*-)", "*)")), + ("gasp", +0.05): set((">:o", ":-O", ":O", ":o", ":-o", "o_O", "o.O", "°O°", "°o°")), + ("worry", -0.25): set( + (">:/", ":-/", ":/", ":\\", ">:\\", ":-.", ":-s", ":s", ":S", ":-S", ">.>") + ), + ("frown", -0.75): set( + (">:[", ":-(", ":(", "=(", ":-[", ":[", ":{", ":-<", ":c", ":-c", "=/") + ), + ("cry", -1.00): set((":'(", ":'''(", ";'(")), } -RE_EMOTICONS = [r" ?".join([re.escape(each) for each in e]) for v in EMOTICONS.values() for e in v] +RE_EMOTICONS = [ + r" ?".join([re.escape(each) for each in e]) for v in EMOTICONS.values() for e in v +] RE_EMOTICONS = re.compile(r"(%s)($|\s)" % "|".join(RE_EMOTICONS)) # Handle sarcasm punctuation (!). @@ -241,23 +334,30 @@ def penntreebank2universal(token, tag): # Handle common contractions. replacements = { - "'d": " 'd", - "'m": " 'm", - "'s": " 's", + "'d": " 'd", + "'m": " 'm", + "'s": " 's", "'ll": " 'll", "'re": " 're", "'ve": " 've", - "n't": " n't" + "n't": " n't", } # Handle paragraph line breaks (\n\n marks end of sentence). EOS = "END-OF-SENTENCE" -def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, replace=replacements, linebreak=r"\n{2,}"): - """ Returns a list of sentences. Each sentence is a space-separated string of tokens (words). - Handles common cases of abbreviations (e.g., etc., ...). - Punctuation marks are split from other words. Periods (or ?!) mark the end of a sentence. - Headings without an ending period are inferred by line breaks. + +def find_tokens( + string, + punctuation=PUNCTUATION, + abbreviations=ABBREVIATIONS, + replace=replacements, + linebreak=r"\n{2,}", +): + """Returns a list of sentences. Each sentence is a space-separated string of tokens (words). + Handles common cases of abbreviations (e.g., etc., ...). + Punctuation marks are split from other words. Periods (or ?!) mark the end of a sentence. + Headings without an ending period are inferred by line breaks. """ # Handle periods separately. punctuation = tuple(punctuation.replace(".", "")) @@ -265,43 +365,50 @@ def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, re for a, b in list(replace.items()): string = re.sub(a, b, string) # Handle Unicode quotes. 
- if isinstance(string, unicode): - string = unicode(string).replace("“", " “ ")\ - .replace("”", " ” ")\ - .replace("‘", " ‘ ")\ - .replace("’", " ’ ")\ - .replace("'", " ' ")\ - .replace('"', ' " ') + if isinstance(string, str): + string = ( + str(string) + .replace("“", " “ ") + .replace("”", " ” ") + .replace("‘", " ‘ ") + .replace("’", " ’ ") + .replace("'", " ' ") + .replace('"', ' " ') + ) # Collapse whitespace. string = re.sub("\r\n", "\n", string) string = re.sub(linebreak, " %s " % EOS, string) string = re.sub(r"\s+", " ", string) tokens = [] - for t in TOKEN.findall(string+" "): + for t in TOKEN.findall(string + " "): if len(t) > 0: tail = [] - while t.startswith(punctuation) and \ - not t in replace: + while t.startswith(punctuation) and t not in replace: # Split leading punctuation. if t.startswith(punctuation): - tokens.append(t[0]); t=t[1:] - while t.endswith(punctuation+(".",)) and \ - not t in replace: + tokens.append(t[0]) + t = t[1:] + while t.endswith(punctuation + (".",)) and t not in replace: # Split trailing punctuation. if t.endswith(punctuation): - tail.append(t[-1]); t=t[:-1] + tail.append(t[-1]) + t = t[:-1] # Split ellipsis (...) before splitting period. if t.endswith("..."): - tail.append("..."); t=t[:-3].rstrip(".") + tail.append("...") + t = t[:-3].rstrip(".") # Split period (if not an abbreviation). if t.endswith("."): - if t in abbreviations or \ - RE_ABBR1.match(t) is not None or \ - RE_ABBR2.match(t) is not None or \ - RE_ABBR3.match(t) is not None: + if ( + t in abbreviations + or RE_ABBR1.match(t) is not None + or RE_ABBR2.match(t) is not None + or RE_ABBR3.match(t) is not None + ): break else: - tail.append(t[-1]); t=t[:-1] + tail.append(t[-1]) + t = t[:-1] if t != "": tokens.append(t) tokens.extend(reversed(tail)) @@ -309,9 +416,19 @@ def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, re while j < len(tokens): if tokens[j] in ("...", ".", "!", "?", EOS): # Handle citations, trailing parenthesis, repeated punctuation (!?). - while j < len(tokens) \ - and tokens[j] in ("'", "\"", u"”", u"’", "...", ".", "!", "?", ")", EOS): - if tokens[j] in ("'", "\"") and sentences[-1].count(tokens[j]) % 2 == 0: + while j < len(tokens) and tokens[j] in ( + "'", + '"', + "”", + "’", + "...", + ".", + "!", + "?", + ")", + EOS, + ): + if tokens[j] in ("'", '"') and sentences[-1].count(tokens[j]) % 2 == 0: break # Balanced quotes. j += 1 sentences[-1].extend(t for t in tokens[i:j] if t != EOS) @@ -321,13 +438,16 @@ def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, re sentences[-1].extend(tokens[i:j]) sentences = (" ".join(s) for s in sentences if len(s) > 0) sentences = (RE_SARCASM.sub("(!)", s) for s in sentences) - sentences = [RE_EMOTICONS.sub( - lambda m: m.group(1).replace(" ", "") + m.group(2), s) for s in sentences] + sentences = [ + RE_EMOTICONS.sub(lambda m: m.group(1).replace(" ", "") + m.group(2), s) + for s in sentences + ] return sentences + #### LEXICON ####################################################################################### -#--- LEXICON --------------------------------------------------------------------------------------- +# --- LEXICON --------------------------------------------------------------------------------------- # Pattern's text parsers are based on Brill's algorithm. # Brill's algorithm automatically acquires a lexicon of known words, # and a set of rules for tagging unknown words from a training corpus. 
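For anyone skimming the tokenizer changes above, a minimal usage sketch (not part of the patch): it assumes the reformatted module is still importable as textblob._text after the src/ layout move and that the default PUNCTUATION, ABBREVIATIONS and contraction replacements apply; the split shown in the comments is indicative only.

    # Illustrative sketch only -- not included in this patch.
    # Assumes textblob._text is importable after the src/ layout move.
    from textblob._text import find_tokens

    for sentence in find_tokens("Mr. Smith isn't here. Call him at 5 p.m. :-)"):
        print(sentence)

    # Each sentence comes back as one space-separated string of tokens,
    # roughly: "Mr. Smith is n't here ." and "Call him at 5 p.m. :-)"
    # (abbreviations keep their periods, contractions are split, and the
    # emoticon is re-joined by the RE_EMOTICONS pass).
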
@@ -337,16 +457,13 @@ def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, re def _read(path, encoding="utf-8", comment=";;;"): - """ Returns an iterator over the lines in the file at the given path, - stripping comments and decoding each line to Unicode. + """Returns an iterator over the lines in the file at the given path, + stripping comments and decoding each line to Unicode. """ if path: if isinstance(path, basestring) and os.path.exists(path): # From file path. - if PY2: - f = codecs.open(path, 'r', encoding='utf-8') - else: - f = open(path, 'r', encoding='utf-8') + f = open(path, encoding="utf-8") elif isinstance(path, basestring): # From string. f = path.splitlines() @@ -356,7 +473,11 @@ def _read(path, encoding="utf-8", comment=";;;"): else: f = path for i, line in enumerate(f): - line = line.strip(codecs.BOM_UTF8) if i == 0 and isinstance(line, binary_type) else line + line = ( + line.strip(codecs.BOM_UTF8) + if i == 0 and isinstance(line, bytes) + else line + ) line = line.strip() line = decode_utf8(line) if not line or (comment and line.startswith(comment)): @@ -366,16 +487,23 @@ def _read(path, encoding="utf-8", comment=";;;"): class Lexicon(lazydict): - - def __init__(self, path="", morphology=None, context=None, entities=None, NNP="NNP", language=None): - """ A dictionary of words and their part-of-speech tags. - For unknown words, rules for word morphology, context and named entities can be used. + def __init__( + self, + path="", + morphology=None, + context=None, + entities=None, + NNP="NNP", + language=None, + ): + """A dictionary of words and their part-of-speech tags. + For unknown words, rules for word morphology, context and named entities can be used. """ self._path = path - self._language = language + self._language = language self.morphology = Morphology(self, path=morphology) - self.context = Context(self, path=context) - self.entities = Entities(self, path=entities, tag=NNP) + self.context = Context(self, path=context) + self.entities = Entities(self, path=entities, tag=NNP) def load(self): # Arnold NNP x @@ -390,35 +518,40 @@ def language(self): return self._language -#--- MORPHOLOGICAL RULES --------------------------------------------------------------------------- +# --- MORPHOLOGICAL RULES --------------------------------------------------------------------------- # Brill's algorithm generates lexical (i.e., morphological) rules in the following format: # NN s fhassuf 1 NNS x => unknown words ending in -s and tagged NN change to NNS. # ly hassuf 2 RB x => unknown words ending in -ly change to RB. -class Rules: - def __init__(self, lexicon={}, cmd={}): +class Rules: + def __init__(self, lexicon=None, cmd=None): + if cmd is None: + cmd = {} + if lexicon is None: + lexicon = {} self.lexicon, self.cmd = lexicon, cmd def apply(self, x): - """ Applies the rule to the given token or list of tokens. - """ + """Applies the rule to the given token or list of tokens.""" return x -class Morphology(lazylist, Rules): - def __init__(self, lexicon={}, path=""): - """ A list of rules based on word morphology (prefix, suffix). - """ - cmd = ("char", # Word contains x. - "haspref", # Word starts with x. - "hassuf", # Word end with x. - "addpref", # x + word is in lexicon. - "addsuf", # Word + x is in lexicon. - "deletepref", # Word without x at the start is in lexicon. - "deletesuf", # Word without x at the end is in lexicon. - "goodleft", # Word preceded by word x. - "goodright", # Word followed by word x. 
+class Morphology(lazylist, Rules): + def __init__(self, lexicon=None, path=""): + """A list of rules based on word morphology (prefix, suffix).""" + if lexicon is None: + lexicon = {} + cmd = ( + "char", # Word contains x. + "haspref", # Word starts with x. + "hassuf", # Word end with x. + "addpref", # x + word is in lexicon. + "addsuf", # Word + x is in lexicon. + "deletepref", # Word without x at the start is in lexicon. + "deletesuf", # Word without x at the end is in lexicon. + "goodleft", # Word preceded by word x. + "goodright", # Word followed by word x. ) cmd = dict.fromkeys(cmd, True) cmd.update(("f" + k, v) for k, v in list(cmd.items())) @@ -434,31 +567,40 @@ def load(self): list.extend(self, (x.split() for x in _read(self._path))) def apply(self, token, previous=(None, None), next=(None, None)): - """ Applies lexical rules to the given token, which is a [word, tag] list. - """ + """Applies lexical rules to the given token, which is a [word, tag] list.""" w = token[0] for r in self: - if r[1] in self.cmd: # Rule = ly hassuf 2 RB x + if r[1] in self.cmd: # Rule = ly hassuf 2 RB x f, x, pos, cmd = bool(0), r[0], r[-2], r[1].lower() - if r[2] in self.cmd: # Rule = NN s fhassuf 1 NNS x + if r[2] in self.cmd: # Rule = NN s fhassuf 1 NNS x f, x, pos, cmd = bool(1), r[1], r[-2], r[2].lower().lstrip("f") if f and token[1] != r[0]: continue - if (cmd == "char" and x in w) \ - or (cmd == "haspref" and w.startswith(x)) \ - or (cmd == "hassuf" and w.endswith(x)) \ - or (cmd == "addpref" and x + w in self.lexicon) \ - or (cmd == "addsuf" and w + x in self.lexicon) \ - or (cmd == "deletepref" and w.startswith(x) and w[len(x):] in self.lexicon) \ - or (cmd == "deletesuf" and w.endswith(x) and w[:-len(x)] in self.lexicon) \ - or (cmd == "goodleft" and x == next[0]) \ - or (cmd == "goodright" and x == previous[0]): + if ( + (cmd == "char" and x in w) + or (cmd == "haspref" and w.startswith(x)) + or (cmd == "hassuf" and w.endswith(x)) + or (cmd == "addpref" and x + w in self.lexicon) + or (cmd == "addsuf" and w + x in self.lexicon) + or ( + cmd == "deletepref" + and w.startswith(x) + and w[len(x) :] in self.lexicon + ) + or ( + cmd == "deletesuf" + and w.endswith(x) + and w[: -len(x)] in self.lexicon + ) + or (cmd == "goodleft" and x == next[0]) + or (cmd == "goodright" and x == previous[0]) + ): token[1] = pos return token def insert(self, i, tag, affix, cmd="hassuf", tagged=None): - """ Inserts a new rule that assigns the given tag to words with the given affix, - e.g., Morphology.append("RB", "-ly"). + """Inserts a new rule that assigns the given tag to words with the given affix, + e.g., Morphology.append("RB", "-ly"). 
""" if affix.startswith("-") and affix.endswith("-"): affix, cmd = affix[+1:-1], "char" @@ -467,54 +609,59 @@ def insert(self, i, tag, affix, cmd="hassuf", tagged=None): if affix.endswith("-"): affix, cmd = affix[+0:-1], "haspref" if tagged: - r = [tagged, affix, "f"+cmd.lstrip("f"), tag, "x"] + r = [tagged, affix, "f" + cmd.lstrip("f"), tag, "x"] else: r = [affix, cmd.lstrip("f"), tag, "x"] lazylist.insert(self, i, r) def append(self, *args, **kwargs): - self.insert(len(self)-1, *args, **kwargs) + self.insert(len(self) - 1, *args, **kwargs) - def extend(self, rules=[]): + def extend(self, rules=None): + if rules is None: + rules = [] for r in rules: self.append(*r) -#--- CONTEXT RULES --------------------------------------------------------------------------------- + +# --- CONTEXT RULES --------------------------------------------------------------------------------- # Brill's algorithm generates contextual rules in the following format: # VBD VB PREVTAG TO => unknown word tagged VBD changes to VB if preceded by a word tagged TO. -class Context(lazylist, Rules): - def __init__(self, lexicon={}, path=""): - """ A list of rules based on context (preceding and following words). - """ - cmd = ("prevtag", # Preceding word is tagged x. - "nexttag", # Following word is tagged x. - "prev2tag", # Word 2 before is tagged x. - "next2tag", # Word 2 after is tagged x. - "prev1or2tag", # One of 2 preceding words is tagged x. - "next1or2tag", # One of 2 following words is tagged x. - "prev1or2or3tag", # One of 3 preceding words is tagged x. - "next1or2or3tag", # One of 3 following words is tagged x. - "surroundtag", # Preceding word is tagged x and following word is tagged y. - "curwd", # Current word is x. - "prevwd", # Preceding word is x. - "nextwd", # Following word is x. - "prev1or2wd", # One of 2 preceding words is x. - "next1or2wd", # One of 2 following words is x. - "next1or2or3wd", # One of 3 preceding words is x. - "prev1or2or3wd", # One of 3 following words is x. - "prevwdtag", # Preceding word is x and tagged y. - "nextwdtag", # Following word is x and tagged y. - "wdprevtag", # Current word is y and preceding word is tagged x. - "wdnexttag", # Current word is x and following word is tagged y. - "wdand2aft", # Current word is x and word 2 after is y. - "wdand2tagbfr", # Current word is y and word 2 before is tagged x. - "wdand2tagaft", # Current word is x and word 2 after is tagged y. - "lbigram", # Current word is y and word before is x. - "rbigram", # Current word is x and word after is y. - "prevbigram", # Preceding word is tagged x and word before is tagged y. - "nextbigram", # Following word is tagged x and word after is tagged y. +class Context(lazylist, Rules): + def __init__(self, lexicon=None, path=""): + """A list of rules based on context (preceding and following words).""" + if lexicon is None: + lexicon = {} + cmd = ( + "prevtag", # Preceding word is tagged x. + "nexttag", # Following word is tagged x. + "prev2tag", # Word 2 before is tagged x. + "next2tag", # Word 2 after is tagged x. + "prev1or2tag", # One of 2 preceding words is tagged x. + "next1or2tag", # One of 2 following words is tagged x. + "prev1or2or3tag", # One of 3 preceding words is tagged x. + "next1or2or3tag", # One of 3 following words is tagged x. + "surroundtag", # Preceding word is tagged x and following word is tagged y. + "curwd", # Current word is x. + "prevwd", # Preceding word is x. + "nextwd", # Following word is x. + "prev1or2wd", # One of 2 preceding words is x. 
+ "next1or2wd", # One of 2 following words is x. + "next1or2or3wd", # One of 3 preceding words is x. + "prev1or2or3wd", # One of 3 following words is x. + "prevwdtag", # Preceding word is x and tagged y. + "nextwdtag", # Following word is x and tagged y. + "wdprevtag", # Current word is y and preceding word is tagged x. + "wdnexttag", # Current word is x and following word is tagged y. + "wdand2aft", # Current word is x and word 2 after is y. + "wdand2tagbfr", # Current word is y and word 2 before is tagged x. + "wdand2tagaft", # Current word is x and word 2 after is tagged y. + "lbigram", # Current word is y and word before is x. + "rbigram", # Current word is x and word after is y. + "prevbigram", # Preceding word is tagged x and word before is tagged y. + "nextbigram", # Following word is tagged x and word after is tagged y. ) Rules.__init__(self, lexicon, dict.fromkeys(cmd, True)) self._path = path @@ -528,10 +675,10 @@ def load(self): list.extend(self, (x.split() for x in _read(self._path))) def apply(self, tokens): - """ Applies contextual rules to the given list of tokens, - where each token is a [word, tag] list. + """Applies contextual rules to the given list of tokens, + where each token is a [word, tag] list. """ - o = [("STAART", "STAART")] * 3 # Empty delimiters for look ahead/back. + o = [("STAART", "STAART")] * 3 # Empty delimiters for look ahead/back. t = o + tokens + o for i, token in enumerate(t): for r in self: @@ -541,70 +688,86 @@ def apply(self, tokens): continue cmd, x, y = r[2], r[3], r[4] if len(r) > 4 else "" cmd = cmd.lower() - if (cmd == "prevtag" and x == t[i-1][1]) \ - or (cmd == "nexttag" and x == t[i+1][1]) \ - or (cmd == "prev2tag" and x == t[i-2][1]) \ - or (cmd == "next2tag" and x == t[i+2][1]) \ - or (cmd == "prev1or2tag" and x in (t[i-1][1], t[i-2][1])) \ - or (cmd == "next1or2tag" and x in (t[i+1][1], t[i+2][1])) \ - or (cmd == "prev1or2or3tag" and x in (t[i-1][1], t[i-2][1], t[i-3][1])) \ - or (cmd == "next1or2or3tag" and x in (t[i+1][1], t[i+2][1], t[i+3][1])) \ - or (cmd == "surroundtag" and x == t[i-1][1] and y == t[i+1][1]) \ - or (cmd == "curwd" and x == t[i+0][0]) \ - or (cmd == "prevwd" and x == t[i-1][0]) \ - or (cmd == "nextwd" and x == t[i+1][0]) \ - or (cmd == "prev1or2wd" and x in (t[i-1][0], t[i-2][0])) \ - or (cmd == "next1or2wd" and x in (t[i+1][0], t[i+2][0])) \ - or (cmd == "prevwdtag" and x == t[i-1][0] and y == t[i-1][1]) \ - or (cmd == "nextwdtag" and x == t[i+1][0] and y == t[i+1][1]) \ - or (cmd == "wdprevtag" and x == t[i-1][1] and y == t[i+0][0]) \ - or (cmd == "wdnexttag" and x == t[i+0][0] and y == t[i+1][1]) \ - or (cmd == "wdand2aft" and x == t[i+0][0] and y == t[i+2][0]) \ - or (cmd == "wdand2tagbfr" and x == t[i-2][1] and y == t[i+0][0]) \ - or (cmd == "wdand2tagaft" and x == t[i+0][0] and y == t[i+2][1]) \ - or (cmd == "lbigram" and x == t[i-1][0] and y == t[i+0][0]) \ - or (cmd == "rbigram" and x == t[i+0][0] and y == t[i+1][0]) \ - or (cmd == "prevbigram" and x == t[i-2][1] and y == t[i-1][1]) \ - or (cmd == "nextbigram" and x == t[i+1][1] and y == t[i+2][1]): + if ( + (cmd == "prevtag" and x == t[i - 1][1]) + or (cmd == "nexttag" and x == t[i + 1][1]) + or (cmd == "prev2tag" and x == t[i - 2][1]) + or (cmd == "next2tag" and x == t[i + 2][1]) + or (cmd == "prev1or2tag" and x in (t[i - 1][1], t[i - 2][1])) + or (cmd == "next1or2tag" and x in (t[i + 1][1], t[i + 2][1])) + or ( + cmd == "prev1or2or3tag" + and x in (t[i - 1][1], t[i - 2][1], t[i - 3][1]) + ) + or ( + cmd == "next1or2or3tag" + and x in (t[i + 1][1], t[i + 
2][1], t[i + 3][1]) + ) + or (cmd == "surroundtag" and x == t[i - 1][1] and y == t[i + 1][1]) + or (cmd == "curwd" and x == t[i + 0][0]) + or (cmd == "prevwd" and x == t[i - 1][0]) + or (cmd == "nextwd" and x == t[i + 1][0]) + or (cmd == "prev1or2wd" and x in (t[i - 1][0], t[i - 2][0])) + or (cmd == "next1or2wd" and x in (t[i + 1][0], t[i + 2][0])) + or (cmd == "prevwdtag" and x == t[i - 1][0] and y == t[i - 1][1]) + or (cmd == "nextwdtag" and x == t[i + 1][0] and y == t[i + 1][1]) + or (cmd == "wdprevtag" and x == t[i - 1][1] and y == t[i + 0][0]) + or (cmd == "wdnexttag" and x == t[i + 0][0] and y == t[i + 1][1]) + or (cmd == "wdand2aft" and x == t[i + 0][0] and y == t[i + 2][0]) + or (cmd == "wdand2tagbfr" and x == t[i - 2][1] and y == t[i + 0][0]) + or (cmd == "wdand2tagaft" and x == t[i + 0][0] and y == t[i + 2][1]) + or (cmd == "lbigram" and x == t[i - 1][0] and y == t[i + 0][0]) + or (cmd == "rbigram" and x == t[i + 0][0] and y == t[i + 1][0]) + or (cmd == "prevbigram" and x == t[i - 2][1] and y == t[i - 1][1]) + or (cmd == "nextbigram" and x == t[i + 1][1] and y == t[i + 2][1]) + ): t[i] = [t[i][0], r[1]] - return t[len(o):-len(o)] + return t[len(o) : -len(o)] def insert(self, i, tag1, tag2, cmd="prevtag", x=None, y=None): - """ Inserts a new rule that updates words with tag1 to tag2, - given constraints x and y, e.g., Context.append("TO < NN", "VB") + """Inserts a new rule that updates words with tag1 to tag2, + given constraints x and y, e.g., Context.append("TO < NN", "VB") """ if " < " in tag1 and not x and not y: - tag1, x = tag1.split(" < "); cmd="prevtag" + tag1, x = tag1.split(" < ") + cmd = "prevtag" if " > " in tag1 and not x and not y: - x, tag1 = tag1.split(" > "); cmd="nexttag" + x, tag1 = tag1.split(" > ") + cmd = "nexttag" lazylist.insert(self, i, [tag1, tag2, cmd, x or "", y or ""]) def append(self, *args, **kwargs): - self.insert(len(self)-1, *args, **kwargs) + self.insert(len(self) - 1, *args, **kwargs) - def extend(self, rules=[]): + def extend(self, rules=None): + if rules is None: + rules = [] for r in rules: self.append(*r) -#--- NAMED ENTITY RECOGNIZER ----------------------------------------------------------------------- -RE_ENTITY1 = re.compile(r"^http://") # http://www.domain.com/path -RE_ENTITY2 = re.compile(r"^www\..*?\.[com|org|net|edu|de|uk]$") # www.domain.com -RE_ENTITY3 = re.compile(r"^[\w\-\.\+]+@(\w[\w\-]+\.)+[\w\-]+$") # name@domain.com -class Entities(lazydict, Rules): +# --- NAMED ENTITY RECOGNIZER ----------------------------------------------------------------------- - def __init__(self, lexicon={}, path="", tag="NNP"): - """ A dictionary of named entities and their labels. - For domain names and e-mail adresses, regular expressions are used. +RE_ENTITY1 = re.compile(r"^http://") # http://www.domain.com/path +RE_ENTITY2 = re.compile(r"^www\..*?\.[com|org|net|edu|de|uk]$") # www.domain.com +RE_ENTITY3 = re.compile(r"^[\w\-\.\+]+@(\w[\w\-]+\.)+[\w\-]+$") # name@domain.com + + +class Entities(lazydict, Rules): + def __init__(self, lexicon=None, path="", tag="NNP"): + """A dictionary of named entities and their labels. + For domain names and e-mail adresses, regular expressions are used. 
""" + if lexicon is None: + lexicon = {} cmd = ( - "pers", # Persons: George/NNP-PERS - "loc", # Locations: Washington/NNP-LOC - "org", # Organizations: Google/NNP-ORG + "pers", # Persons: George/NNP-PERS + "loc", # Locations: Washington/NNP-LOC + "org", # Organizations: Google/NNP-ORG ) Rules.__init__(self, lexicon, cmd) self._path = path - self.tag = tag + self.tag = tag @property def path(self): @@ -618,37 +781,40 @@ def load(self): dict.setdefault(self, x[0], []).append(x) def apply(self, tokens): - """ Applies the named entity recognizer to the given list of tokens, - where each token is a [word, tag] list. + """Applies the named entity recognizer to the given list of tokens, + where each token is a [word, tag] list. """ # Note: we could also scan for patterns, e.g., # "my|his|her name is|was *" => NNP-PERS. i = 0 while i < len(tokens): w = tokens[i][0].lower() - if RE_ENTITY1.match(w) \ - or RE_ENTITY2.match(w) \ - or RE_ENTITY3.match(w): + if RE_ENTITY1.match(w) or RE_ENTITY2.match(w) or RE_ENTITY3.match(w): tokens[i][1] = self.tag if w in self: for e in self[w]: # Look ahead to see if successive words match the named entity. - e, tag = (e[:-1], "-"+e[-1].upper()) if e[-1] in self.cmd else (e, "") + e, tag = ( + (e[:-1], "-" + e[-1].upper()) if e[-1] in self.cmd else (e, "") + ) b = True for j, e in enumerate(e): - if i + j >= len(tokens) or tokens[i+j][0].lower() != e: - b = False; break + if i + j >= len(tokens) or tokens[i + j][0].lower() != e: + b = False + break if b: - for token in tokens[i:i+j+1]: - token[1] = (token[1] == "NNPS" and token[1] or self.tag) + tag + for token in tokens[i : i + j + 1]: + token[1] = ( + token[1] == "NNPS" and token[1] or self.tag + ) + tag i += j break i += 1 return tokens def append(self, entity, name="pers"): - """ Appends a named entity to the lexicon, - e.g., Entities.append("Hooloovoo", "PERS") + """Appends a named entity to the lexicon, + e.g., Entities.append("Hooloovoo", "PERS") """ e = [s.lower() for s in entity.split(" ") + [name]] self.setdefault(e[0], []).append(e) @@ -677,45 +843,48 @@ def extend(self, entities): # negative words + positive emoticons could indicate cynicism. # Semantic labels: -MOOD = "mood" # emoticons, emojis -IRONY = "irony" # sarcasm mark (!) +MOOD = "mood" # emoticons, emojis +IRONY = "irony" # sarcasm mark (!) -NOUN, VERB, ADJECTIVE, ADVERB = \ - "NN", "VB", "JJ", "RB" +NOUN, VERB, ADJECTIVE, ADVERB = "NN", "VB", "JJ", "RB" RE_SYNSET = re.compile(r"^[acdnrv][-_][0-9]+$") + def avg(list): return sum(list) / float(len(list) or 1) -class Score(tuple): - def __new__(self, polarity, subjectivity, assessments=[]): - """ A (polarity, subjectivity)-tuple with an assessments property. - """ +class Score(tuple): + def __new__(self, polarity, subjectivity, assessments=None): + """A (polarity, subjectivity)-tuple with an assessments property.""" + if assessments is None: + assessments = [] return tuple.__new__(self, [polarity, subjectivity]) - def __init__(self, polarity, subjectivity, assessments=[]): + def __init__(self, polarity, subjectivity, assessments=None): + if assessments is None: + assessments = [] self.assessments = assessments -class Sentiment(lazydict): +class Sentiment(lazydict): def __init__(self, path="", language=None, synset=None, confidence=None, **kwargs): - """ A dictionary of words (adjectives) and polarity scores (positive/negative). - The value for each word is a dictionary of part-of-speech tags. 
- The value for each word POS-tag is a tuple with values for - polarity (-1.0-1.0), subjectivity (0.0-1.0) and intensity (0.5-2.0). + """A dictionary of words (adjectives) and polarity scores (positive/negative). + The value for each word is a dictionary of part-of-speech tags. + The value for each word POS-tag is a tuple with values for + polarity (-1.0-1.0), subjectivity (0.0-1.0) and intensity (0.5-2.0). """ - self._path = path # XML file path. - self._language = None # XML language attribute ("en", "fr", ...) - self._confidence = None # XML confidence attribute threshold (>=). - self._synset = synset # XML synset attribute ("wordnet_id", "cornetto_id", ...) - self._synsets = {} # {"a-01123879": (1.0, 1.0, 1.0)} - self.labeler = {} # {"dammit": "profanity"} - self.tokenizer = kwargs.get("tokenizer", find_tokens) - self.negations = kwargs.get("negations", ("no", "not", "n't", "never")) - self.modifiers = kwargs.get("modifiers", ("RB",)) - self.modifier = kwargs.get("modifier" , lambda w: w.endswith("ly")) + self._path = path # XML file path. + self._language = None # XML language attribute ("en", "fr", ...) + self._confidence = None # XML confidence attribute threshold (>=). + self._synset = synset # XML synset attribute ("wordnet_id", "cornetto_id", ...) + self._synsets = {} # {"a-01123879": (1.0, 1.0, 1.0)} + self.labeler = {} # {"dammit": "profanity"} + self.tokenizer = kwargs.get("tokenizer", find_tokens) + self.negations = kwargs.get("negations", ("no", "not", "n't", "never")) + self.modifiers = kwargs.get("modifiers", ("RB",)) + self.modifier = kwargs.get("modifier", lambda w: w.endswith("ly")) @property def path(self): @@ -730,8 +899,8 @@ def confidence(self): return self._confidence def load(self, path=None): - """ Loads the XML-file (with sentiment annotations) from the given path. - By default, Sentiment.path is lazily loaded. + """Loads the XML-file (with sentiment annotations) from the given path. + By default, Sentiment.path is lazily loaded. """ # # @@ -743,8 +912,9 @@ def load(self, path=None): xml = cElementTree.parse(path) xml = xml.getroot() for w in xml.findall("word"): - if self._confidence is None \ - or self._confidence <= float(w.attrib.get("confidence", 0.0)): + if self._confidence is None or self._confidence <= float( + w.attrib.get("confidence", 0.0) + ): w, pos, p, s, i, label, synset = ( w.attrib.get("form"), w.attrib.get("pos"), @@ -752,7 +922,7 @@ def load(self, path=None): w.attrib.get("subjectivity", 0.0), w.attrib.get("intensity", 1.0), w.attrib.get("label"), - w.attrib.get(self._synset) # wordnet_id, cornetto_id, ... + w.attrib.get(self._synset), # wordnet_id, cornetto_id, ... ) psi = (float(p), float(s), float(i)) if w: @@ -764,7 +934,10 @@ def load(self, path=None): self._language = xml.attrib.get("language", self._language) # Average scores of all word senses per part-of-speech tag. for w in words: - words[w] = dict((pos, [avg(each) for each in zip(*psi)]) for pos, psi in words[w].items()) + words[w] = dict( + (pos, [avg(each) for each in zip(*psi)]) + for pos, psi in words[w].items() + ) # Average scores of all part-of-speech tags. for w, pos in list(words.items()): words[w][None] = [avg(each) for each in zip(*pos.values())] @@ -776,9 +949,9 @@ def load(self, path=None): dict.update(self._synsets, synsets) def synset(self, id, pos=ADJECTIVE): - """ Returns a (polarity, subjectivity)-tuple for the given synset id. - For example, the adjective "horrible" has id 193480 in WordNet: - Sentiment.synset(193480, pos="JJ") => (-0.6, 1.0, 1.0). 
+ """Returns a (polarity, subjectivity)-tuple for the given synset id. + For example, the adjective "horrible" has id 193480 in WordNet: + Sentiment.synset(193480, pos="JJ") => (-0.6, 1.0, 1.0). """ id = str(id).zfill(8) if not id.startswith(("n-", "v-", "a-", "r-")): @@ -795,12 +968,13 @@ def synset(self, id, pos=ADJECTIVE): return tuple(self._synsets.get(id, (0.0, 0.0))[:2]) def __call__(self, s, negation=True, **kwargs): - """ Returns a (polarity, subjectivity)-tuple for the given sentence, - with polarity between -1.0 and 1.0 and subjectivity between 0.0 and 1.0. - The sentence can be a string, Synset, Text, Sentence, Chunk, Word, Document, Vector. - An optional weight parameter can be given, - as a function that takes a list of words and returns a weight. + """Returns a (polarity, subjectivity)-tuple for the given sentence, + with polarity between -1.0 and 1.0 and subjectivity between 0.0 and 1.0. + The sentence can be a string, Synset, Text, Sentence, Chunk, Word, Document, Vector. + An optional weight parameter can be given, + as a function that takes a list of words and returns a weight. """ + def avg(assessments, weighted=lambda w: 1): s, n = 0, 0 for words, score in assessments: @@ -808,6 +982,7 @@ def avg(assessments, weighted=lambda w: 1): s += w * score n += w return s / float(n or 1) + # A pattern.en.wordnet.Synset. # Sentiment(synsets("horrible", "JJ")[0]) => (-0.6, 1.0) if hasattr(s, "gloss"): @@ -815,19 +990,31 @@ def avg(assessments, weighted=lambda w: 1): # A synset id. # Sentiment("a-00193480") => horrible => (-0.6, 1.0) (English WordNet) # Sentiment("c_267") => verschrikkelijk => (-0.9, 1.0) (Dutch Cornetto) - elif isinstance(s, basestring) and RE_SYNSET.match(s) and hasattr(s, "synonyms"): + elif ( + isinstance(s, basestring) and RE_SYNSET.match(s) and hasattr(s, "synonyms") + ): a = [(s.synonyms[0],) + self.synset(s.id, pos=s.pos) + (None,)] # A string of words. # Sentiment("a horrible movie") => (-0.6, 1.0) elif isinstance(s, basestring): - a = self.assessments(((w.lower(), None) for w in " ".join(self.tokenizer(s)).split()), negation) + a = self.assessments( + ((w.lower(), None) for w in " ".join(self.tokenizer(s)).split()), + negation, + ) # A pattern.en.Text. elif hasattr(s, "sentences"): - a = self.assessments(((w.lemma or w.string.lower(), w.pos[:2]) - for w in chain.from_iterable(s)), negation) + a = self.assessments( + ( + (w.lemma or w.string.lower(), w.pos[:2]) + for w in chain.from_iterable(s) + ), + negation, + ) # A pattern.en.Sentence or pattern.en.Chunk. elif hasattr(s, "lemmata"): - a = self.assessments(((w.lemma or w.string.lower(), w.pos[:2]) for w in s.words), negation) + a = self.assessments( + ((w.lemma or w.string.lower(), w.pos[:2]) for w in s.words), negation + ) # A pattern.en.Word. elif hasattr(s, "lemma"): a = self.assessments(((s.lemma or s.string.lower(), s.pos[:2]),), negation) @@ -836,30 +1023,38 @@ def avg(assessments, weighted=lambda w: 1): # Bag-of words is unordered: inject None between each two words # to stop assessments() from scanning for preceding negation & modifiers. elif hasattr(s, "terms"): - a = self.assessments(chain.from_iterable(((w, None), (None, None)) for w in s), negation) + a = self.assessments( + chain.from_iterable(((w, None), (None, None)) for w in s), negation + ) kwargs.setdefault("weight", lambda w: s.terms[w[0]]) # A dict of (word, weight)-items. 
elif isinstance(s, dict): - a = self.assessments(chain.from_iterable(((w, None), (None, None)) for w in s), negation) + a = self.assessments( + chain.from_iterable(((w, None), (None, None)) for w in s), negation + ) kwargs.setdefault("weight", lambda w: s[w[0]]) # A list of words. elif isinstance(s, list): a = self.assessments(((w, None) for w in s), negation) else: a = [] - weight = kwargs.get("weight", lambda w: 1) # [(w, p) for w, p, s, x in a] - return Score(polarity = avg( [(w, p) for w, p, s, x in a], weight ), - subjectivity = avg([(w, s) for w, p, s, x in a], weight), - assessments = a) - - def assessments(self, words=[], negation=True): - """ Returns a list of (chunk, polarity, subjectivity, label)-tuples for the given list of words: - where chunk is a list of successive words: a known word optionally - preceded by a modifier ("very good") or a negation ("not good"). + weight = kwargs.get("weight", lambda w: 1) # [(w, p) for w, p, s, x in a] + return Score( + polarity=avg([(w, p) for w, p, s, x in a], weight), + subjectivity=avg([(w, s) for w, p, s, x in a], weight), + assessments=a, + ) + + def assessments(self, words=None, negation=True): + """Returns a list of (chunk, polarity, subjectivity, label)-tuples for the given list of words: + where chunk is a list of successive words: a known word optionally + preceded by a modifier ("very good") or a negation ("not good"). """ + if words is None: + words = [] a = [] - m = None # Preceding modifier (i.e., adverb or adjective). - n = None # Preceding negation (e.g., "not beautiful"). + m = None # Preceding modifier (i.e., adverb or adjective). + n = None # Preceding negation (e.g., "not beautiful"). for w, pos in words: # Only assess known words, preferably by part-of-speech tag. # Including unknown words (polarity 0.0 and subjectivity 0.0) lowers the average. @@ -886,7 +1081,11 @@ def assessments(self, words=[], negation=True): # Known word may be modifying the next word (i.e., it is a known adverb). m = None n = None - if pos and pos in self.modifiers or any(map(self[w].__contains__, self.modifiers)): + if ( + pos + and pos in self.modifiers + or any(map(self[w].__contains__, self.modifiers)) + ): m = (w, pos) if negation and w in self.negations: n = w @@ -898,7 +1097,11 @@ def assessments(self, words=[], negation=True): elif n and len(w.strip("'")) > 1: n = None # Unknown word may be a negation preceded by a modifier ("really not good"). 
- if n is not None and m is not None and (pos in self.modifiers or self.modifier(m[0])): + if ( + n is not None + and m is not None + and (pos in self.modifiers or self.modifier(m[0])) + ): a[-1]["w"].append(n) a[-1]["n"] = -1 n = None @@ -913,9 +1116,11 @@ def assessments(self, words=[], negation=True): if w == "(!)": a.append(dict(w=[w], p=0.0, s=1.0, i=1.0, n=1, x=IRONY)) # EMOTICONS: {("grin", +1.0): set((":-D", ":D"))} - if w.isalpha() is False and len(w) <= 5 and w not in PUNCTUATION: # speedup - for (type, p), e in EMOTICONS.items(): - if w in imap(lambda e: e.lower(), e): + if ( + w.isalpha() is False and len(w) <= 5 and w not in PUNCTUATION + ): # speedup + for (_type, p), e in EMOTICONS.items(): + if w in map(lambda e: e.lower(), e): a.append(dict(w=[w], p=p, s=1.0, i=1.0, n=1, x=MOOD)) break for i in range(len(a)): @@ -928,23 +1133,26 @@ def assessments(self, words=[], negation=True): a[i] = (w, p * -0.5 if n < 0 else p, s, x) return a - def annotate(self, word, pos=None, polarity=0.0, subjectivity=0.0, intensity=1.0, label=None): - """ Annotates the given word with polarity, subjectivity and intensity scores, - and optionally a semantic label (e.g., MOOD for emoticons, IRONY for "(!)"). + def annotate( + self, word, pos=None, polarity=0.0, subjectivity=0.0, intensity=1.0, label=None + ): + """Annotates the given word with polarity, subjectivity and intensity scores, + and optionally a semantic label (e.g., MOOD for emoticons, IRONY for "(!)"). """ w = self.setdefault(word, {}) w[pos] = w[None] = (polarity, subjectivity, intensity) if label: self.labeler[word] = label -#--- PART-OF-SPEECH TAGGER ------------------------------------------------------------------------- + +# --- PART-OF-SPEECH TAGGER ------------------------------------------------------------------------- # Unknown words are recognized as numbers if they contain only digits and -,.:/%$ CD = re.compile(r"^[0-9\-\,\.\:\/\%\$]+$") + def _suffix_rules(token, tag="NN"): - """ Default morphological tagging rules for English, based on word suffixes. - """ + """Default morphological tagging rules for English, based on word suffixes.""" if isinstance(token, (list, tuple)): token, tag = token if token.endswith("ing"): @@ -953,7 +1161,12 @@ def _suffix_rules(token, tag="NN"): tag = "RB" if token.endswith("s") and not token.endswith(("is", "ous", "ss")): tag = "NNS" - if token.endswith(("able", "al", "ful", "ible", "ient", "ish", "ive", "less", "tic", "ous")) or "-" in token: + if ( + token.endswith( + ("able", "al", "ful", "ible", "ient", "ish", "ive", "less", "tic", "ous") + ) + or "-" in token + ): tag = "JJ" if token.endswith("ed"): tag = "VBN" @@ -961,29 +1174,45 @@ def _suffix_rules(token, tag="NN"): tag = "VBP" return [token, tag] -def find_tags(tokens, lexicon={}, model=None, morphology=None, context=None, entities=None, default=("NN", "NNP", "CD"), language="en", map=None, **kwargs): - """ Returns a list of [token, tag]-items for the given list of tokens: - ["The", "cat", "purs"] => [["The", "DT"], ["cat", "NN"], ["purs", "VB"]] - Words are tagged using the given lexicon of (word, tag)-items. - Unknown words are tagged NN by default. - Unknown words that start with a capital letter are tagged NNP (unless language="de"). - Unknown words that consist only of digits and punctuation marks are tagged CD. - Unknown words are then improved with morphological rules. - All words are improved with contextual rules. - If a model is given, uses model for unknown words instead of morphology and context. 
- If map is a function, it is applied to each (token, tag) after applying all rules. + +def find_tags( + tokens, + lexicon=None, + model=None, + morphology=None, + context=None, + entities=None, + default=("NN", "NNP", "CD"), + language="en", + map=None, + **kwargs, +): + """Returns a list of [token, tag]-items for the given list of tokens: + ["The", "cat", "purs"] => [["The", "DT"], ["cat", "NN"], ["purs", "VB"]] + Words are tagged using the given lexicon of (word, tag)-items. + Unknown words are tagged NN by default. + Unknown words that start with a capital letter are tagged NNP (unless language="de"). + Unknown words that consist only of digits and punctuation marks are tagged CD. + Unknown words are then improved with morphological rules. + All words are improved with contextual rules. + If a model is given, uses model for unknown words instead of morphology and context. + If map is a function, it is applied to each (token, tag) after applying all rules. """ + if lexicon is None: + lexicon = {} tagged = [] # Tag known words. for i, token in enumerate(tokens): - tagged.append([token, lexicon.get(token, i == 0 and lexicon.get(token.lower()) or None)]) + tagged.append( + [token, lexicon.get(token, i == 0 and lexicon.get(token.lower()) or None)] + ) # Tag unknown words. for i, (token, tag) in enumerate(tagged): prev, next = (None, None), (None, None) if i > 0: - prev = tagged[i-1] + prev = tagged[i - 1] if i < len(tagged) - 1: - next = tagged[i+1] + next = tagged[i + 1] if tag is None or token in (model is not None and model.unknown or ()): # Use language model (i.e., SLP). if model is not None: @@ -1014,7 +1243,8 @@ def find_tags(tokens, lexicon={}, model=None, morphology=None, context=None, ent tagged = [list(map(token, tag)) or [token, default[0]] for token, tag in tagged] return tagged -#--- PHRASE CHUNKER -------------------------------------------------------------------------------- + +# --- PHRASE CHUNKER -------------------------------------------------------------------------------- SEPARATOR = "/" @@ -1026,46 +1256,82 @@ def find_tags(tokens, lexicon={}, model=None, morphology=None, context=None, ent # Chunking rules. # CHUNKS[0] = Germanic: RB + JJ precedes NN ("the round table"). # CHUNKS[1] = Romance: RB + JJ precedes or follows NN ("la table ronde", "une jolie fille"). -CHUNKS = [[ - # Germanic languages: en, de, nl, ... - ( "NP", re.compile(r"(("+NN+")/)*((DT|CD|CC|CJ)/)*(("+RB+"|"+JJ+")/)*(("+NN+")/)+")), - ( "VP", re.compile(r"(((MD|"+RB+")/)*(("+VB+")/)+)+")), - ( "VP", re.compile(r"((MD)/)")), - ( "PP", re.compile(r"((IN|PP|TO)/)+")), - ("ADJP", re.compile(r"((CC|CJ|"+RB+"|"+JJ+")/)*(("+JJ+")/)+")), - ("ADVP", re.compile(r"(("+RB+"|WRB)/)+")), -], [ - # Romance languages: es, fr, it, ... - ( "NP", re.compile(r"(("+NN+")/)*((DT|CD|CC|CJ)/)*(("+RB+"|"+JJ+")/)*(("+NN+")/)+(("+RB+"|"+JJ+")/)*")), - ( "VP", re.compile(r"(((MD|"+RB+")/)*(("+VB+")/)+(("+RB+")/)*)+")), - ( "VP", re.compile(r"((MD)/)")), - ( "PP", re.compile(r"((IN|PP|TO)/)+")), - ("ADJP", re.compile(r"((CC|CJ|"+RB+"|"+JJ+")/)*(("+JJ+")/)+")), - ("ADVP", re.compile(r"(("+RB+"|WRB)/)+")), -]] +CHUNKS = [ + [ + # Germanic languages: en, de, nl, ... 
+ ( + "NP", + re.compile( + r"((" + + NN + + ")/)*((DT|CD|CC|CJ)/)*((" + + RB + + "|" + + JJ + + ")/)*((" + + NN + + ")/)+" + ), + ), + ("VP", re.compile(r"(((MD|" + RB + ")/)*((" + VB + ")/)+)+")), + ("VP", re.compile(r"((MD)/)")), + ("PP", re.compile(r"((IN|PP|TO)/)+")), + ("ADJP", re.compile(r"((CC|CJ|" + RB + "|" + JJ + ")/)*((" + JJ + ")/)+")), + ("ADVP", re.compile(r"((" + RB + "|WRB)/)+")), + ], + [ + # Romance languages: es, fr, it, ... + ( + "NP", + re.compile( + r"((" + + NN + + ")/)*((DT|CD|CC|CJ)/)*((" + + RB + + "|" + + JJ + + ")/)*((" + + NN + + ")/)+((" + + RB + + "|" + + JJ + + ")/)*" + ), + ), + ("VP", re.compile(r"(((MD|" + RB + ")/)*((" + VB + ")/)+((" + RB + ")/)*)+")), + ("VP", re.compile(r"((MD)/)")), + ("PP", re.compile(r"((IN|PP|TO)/)+")), + ("ADJP", re.compile(r"((CC|CJ|" + RB + "|" + JJ + ")/)*((" + JJ + ")/)+")), + ("ADVP", re.compile(r"((" + RB + "|WRB)/)+")), + ], +] # Handle ADJP before VP, so that # RB prefers next ADJP over previous VP. CHUNKS[0].insert(1, CHUNKS[0].pop(3)) CHUNKS[1].insert(1, CHUNKS[1].pop(3)) + def find_chunks(tagged, language="en"): - """ The input is a list of [token, tag]-items. - The output is a list of [token, tag, chunk]-items: - The/DT nice/JJ fish/NN is/VBZ dead/JJ ./. => - The/DT/B-NP nice/JJ/I-NP fish/NN/I-NP is/VBZ/B-VP dead/JJ/B-ADJP ././O + """The input is a list of [token, tag]-items. + The output is a list of [token, tag, chunk]-items: + The/DT nice/JJ fish/NN is/VBZ dead/JJ ./. => + The/DT/B-NP nice/JJ/I-NP fish/NN/I-NP is/VBZ/B-VP dead/JJ/B-ADJP ././O """ chunked = [x for x in tagged] - tags = "".join("%s%s" % (tag, SEPARATOR) for token, tag in tagged) + tags = "".join(f"{tag}{SEPARATOR}" for token, tag in tagged) # Use Germanic or Romance chunking rules according to given language. - for tag, rule in CHUNKS[int(language in ("ca", "es", "pt", "fr", "it", "pt", "ro"))]: + for tag, rule in CHUNKS[ + int(language in ("ca", "es", "pt", "fr", "it", "pt", "ro")) + ]: for m in rule.finditer(tags): # Find the start of chunks inside the tags-string. # Number of preceding separators = number of preceding tokens. i = m.start() j = tags[:i].count(SEPARATOR) n = m.group(0).count(SEPARATOR) - for k in range(j, j+n): + for k in range(j, j + n): if len(chunked[k]) == 3: continue if len(chunked[k]) < 3: @@ -1074,26 +1340,27 @@ def find_chunks(tagged, language="en"): j += 1 # Mark first token in chunk with B-. elif k == j: - chunked[k].append("B-"+tag) + chunked[k].append("B-" + tag) # Mark other tokens in chunk with I-. else: - chunked[k].append("I-"+tag) + chunked[k].append("I-" + tag) # Mark chinks (tokens outside of a chunk) with O-. for chink in filter(lambda x: len(x) < 3, chunked): chink.append("O") # Post-processing corrections. - for i, (word, tag, chunk) in enumerate(chunked): + for i, (_word, tag, chunk) in enumerate(chunked): if tag.startswith("RB") and chunk == "B-NP": # "Very nice work" (NP) <=> "Perhaps" (ADVP) + "you" (NP). - if i < len(chunked)-1 and not chunked[i+1][1].startswith("JJ"): - chunked[i+0][2] = "B-ADVP" - chunked[i+1][2] = "B-NP" + if i < len(chunked) - 1 and not chunked[i + 1][1].startswith("JJ"): + chunked[i + 0][2] = "B-ADVP" + chunked[i + 1][2] = "B-NP" return chunked + def find_prepositions(chunked): - """ The input is a list of [token, tag, chunk]-items. - The output is a list of [token, tag, chunk, preposition]-items. - PP-chunks followed by NP-chunks make up a PNP-chunk. + """The input is a list of [token, tag, chunk]-items. + The output is a list of [token, tag, chunk, preposition]-items. 
+ PP-chunks followed by NP-chunks make up a PNP-chunk. """ # Tokens that are not part of a preposition just get the O-tag. for ch in chunked: @@ -1101,12 +1368,13 @@ def find_prepositions(chunked): for i, chunk in enumerate(chunked): if chunk[2].endswith("PP") and chunk[-1] == "O": # Find PP followed by other PP, NP with nouns and pronouns, VP with a gerund. - if i < len(chunked)-1 and \ - (chunked[i+1][2].endswith(("NP", "PP")) or \ - chunked[i+1][1] in ("VBG", "VBN")): + if i < len(chunked) - 1 and ( + chunked[i + 1][2].endswith(("NP", "PP")) + or chunked[i + 1][1] in ("VBG", "VBN") + ): chunk[-1] = "B-PNP" pp = True - for ch in chunked[i+1:]: + for ch in chunked[i + 1 :]: if not (ch[2].endswith(("NP", "PP")) or ch[1] in ("VBG", "VBN")): break if ch[2].endswith("PP") and pp: @@ -1116,9 +1384,10 @@ def find_prepositions(chunked): pp = False return chunked + #### PARSER ######################################################################################## -#--- PARSER ---------------------------------------------------------------------------------------- +# --- PARSER ---------------------------------------------------------------------------------------- # A shallow parser can be used to retrieve syntactic-semantic information from text # in an efficient way (usually at the expense of deeper configurational syntactic information). # The shallow parser in Pattern is meant to handle the following tasks: @@ -1129,7 +1398,7 @@ def find_prepositions(chunked): # 5) Lemmatization: find the base form of each word ("was" => "is"). # WORD TAG CHUNK PNP ROLE LEMMA -#------------------------------------------------------------------ +# ------------------------------------------------------------------ # The DT B-NP O NP-SBJ-1 the # black JJ I-NP O NP-SBJ-1 black # cat NN I-NP O NP-SBJ-1 cat @@ -1151,78 +1420,91 @@ def find_prepositions(chunked): # http://www.clips.ua.ac.be/pages/penn-treebank-tagset PTB = PENN = "penn" -class Parser: - def __init__(self, lexicon={}, default=("NN", "NNP", "CD"), language=None): - """ A simple shallow parser using a Brill-based part-of-speech tagger. - The given lexicon is a dictionary of known words and their part-of-speech tag. - The given default tags are used for unknown words. - Unknown words that start with a capital letter are tagged NNP (except for German). - Unknown words that contain only digits and punctuation are tagged CD. - The given language can be used to discern between - Germanic and Romance languages for phrase chunking. +class Parser: + def __init__(self, lexicon=None, default=("NN", "NNP", "CD"), language=None): + """A simple shallow parser using a Brill-based part-of-speech tagger. + The given lexicon is a dictionary of known words and their part-of-speech tag. + The given default tags are used for unknown words. + Unknown words that start with a capital letter are tagged NNP (except for German). + Unknown words that contain only digits and punctuation are tagged CD. + The given language can be used to discern between + Germanic and Romance languages for phrase chunking. """ - self.lexicon = lexicon - self.default = default + if lexicon is None: + lexicon = {} + self.lexicon = lexicon + self.default = default self.language = language def find_tokens(self, string, **kwargs): - """ Returns a list of sentences from the given string. - Punctuation marks are separated from each word by a space. + """Returns a list of sentences from the given string. + Punctuation marks are separated from each word by a space. """ # "The cat purs." 
=> ["The cat purs ."] - return find_tokens(text_type(string), - punctuation = kwargs.get( "punctuation", PUNCTUATION), - abbreviations = kwargs.get("abbreviations", ABBREVIATIONS), - replace = kwargs.get( "replace", replacements), - linebreak = r"\n{2,}") + return find_tokens( + str(string), + punctuation=kwargs.get("punctuation", PUNCTUATION), + abbreviations=kwargs.get("abbreviations", ABBREVIATIONS), + replace=kwargs.get("replace", replacements), + linebreak=r"\n{2,}", + ) def find_tags(self, tokens, **kwargs): - """ Annotates the given list of tokens with part-of-speech tags. - Returns a list of tokens, where each token is now a [word, tag]-list. + """Annotates the given list of tokens with part-of-speech tags. + Returns a list of tokens, where each token is now a [word, tag]-list. """ # ["The", "cat", "purs"] => [["The", "DT"], ["cat", "NN"], ["purs", "VB"]] - return find_tags(tokens, - language = kwargs.get("language", self.language), - lexicon = kwargs.get( "lexicon", self.lexicon), - default = kwargs.get( "default", self.default), - map = kwargs.get( "map", None)) + return find_tags( + tokens, + language=kwargs.get("language", self.language), + lexicon=kwargs.get("lexicon", self.lexicon), + default=kwargs.get("default", self.default), + map=kwargs.get("map", None), + ) def find_chunks(self, tokens, **kwargs): - """ Annotates the given list of tokens with chunk tags. - Several tags can be added, for example chunk + preposition tags. + """Annotates the given list of tokens with chunk tags. + Several tags can be added, for example chunk + preposition tags. """ # [["The", "DT"], ["cat", "NN"], ["purs", "VB"]] => # [["The", "DT", "B-NP"], ["cat", "NN", "I-NP"], ["purs", "VB", "B-VP"]] return find_prepositions( - find_chunks(tokens, - language = kwargs.get("language", self.language))) + find_chunks(tokens, language=kwargs.get("language", self.language)) + ) def find_prepositions(self, tokens, **kwargs): - """ Annotates the given list of tokens with prepositional noun phrase tags. - """ - return find_prepositions(tokens) # See also Parser.find_chunks(). + """Annotates the given list of tokens with prepositional noun phrase tags.""" + return find_prepositions(tokens) # See also Parser.find_chunks(). def find_labels(self, tokens, **kwargs): - """ Annotates the given list of tokens with verb/predicate tags. - """ + """Annotates the given list of tokens with verb/predicate tags.""" return find_relations(tokens) def find_lemmata(self, tokens, **kwargs): - """ Annotates the given list of tokens with word lemmata. - """ + """Annotates the given list of tokens with word lemmata.""" return [token + [token[0].lower()] for token in tokens] - def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs): - """ Takes a string (sentences) and returns a tagged Unicode string (TaggedString). - Sentences in the output are separated by newlines. - With tokenize=True, punctuation is split from words and sentences are separated by \n. - With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...). - With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...). - With relations=True, semantic role labels are parsed (SBJ, OBJ). - With lemmata=True, word lemmata are parsed. - Optional parameters are passed to - the tokenizer, tagger, chunker, labeler and lemmatizer. 
+ def parse( + self, + s, + tokenize=True, + tags=True, + chunks=True, + relations=False, + lemmata=False, + encoding="utf-8", + **kwargs, + ): + """Takes a string (sentences) and returns a tagged Unicode string (TaggedString). + Sentences in the output are separated by newlines. + With tokenize=True, punctuation is split from words and sentences are separated by \n. + With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...). + With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...). + With relations=True, semantic role labels are parsed (SBJ, OBJ). + With lemmata=True, word lemmata are parsed. + Optional parameters are passed to + the tokenizer, tagger, chunker, labeler and lemmatizer. """ # Tokenizer. if tokenize: @@ -1234,7 +1516,7 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma # Unicode. for i in range(len(s)): for j in range(len(s[i])): - if isinstance(s[i][j], binary_type): + if isinstance(s[i][j], bytes): s[i][j] = decode_string(s[i][j], encoding) # Tagger (required by chunker, labeler & lemmatizer). if tags or chunks or relations or lemmata: @@ -1253,8 +1535,7 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma # Slash-formatted tagged string. # With collapse=False (or split=True), returns raw list # (this output is not usable by tree.Text). - if not kwargs.get("collapse", True) \ - or kwargs.get("split", False): + if not kwargs.get("collapse", True) or kwargs.get("split", False): return s # Construct TaggedString.format. # (this output is usable by tree.Text). @@ -1276,52 +1557,64 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma s[i][j] = "/".join(s[i][j]) s[i] = " ".join(s[i]) s = "\n".join(s) - s = TaggedString(unicode(s), format, language=kwargs.get("language", self.language)) + s = TaggedString( + str(s), format, language=kwargs.get("language", self.language) + ) return s -#--- TAGGED STRING --------------------------------------------------------------------------------- +# --- TAGGED STRING --------------------------------------------------------------------------------- # Pattern.parse() returns a TaggedString: a Unicode string with "tags" and "language" attributes. # The pattern.text.tree.Text class uses this attribute to determine the token format and # transform the tagged string to a parse tree of nested Sentence, Chunk and Word objects. TOKENS = "tokens" -class TaggedString(unicode): - def __new__(self, string, tags=["word"], language=None): - """ Unicode string with tags and language attributes. - For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]). +class TaggedString(str): + def __new__(self, string, tags=None, language=None): + """Unicode string with tags and language attributes. + For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]). 
""" # From a TaggedString: - if isinstance(string, unicode) and hasattr(string, "tags"): + if tags is None: + tags = ["word"] + if isinstance(string, str) and hasattr(string, "tags"): tags, language = string.tags, string.language # From a TaggedString.split(TOKENS) list: if isinstance(string, list): - string = [[[x.replace("/", "&slash;") for x in token] for token in s] for s in string] + string = [ + [[x.replace("/", "&slash;") for x in token] for token in s] + for s in string + ] string = "\n".join(" ".join("/".join(token) for token in s) for s in string) - s = unicode.__new__(self, string) + s = str.__new__(self, string) s.tags = list(tags) s.language = language return s def split(self, sep=TOKENS): - """ Returns a list of sentences, where each sentence is a list of tokens, - where each token is a list of word + tags. + """Returns a list of sentences, where each sentence is a list of tokens, + where each token is a list of word + tags. """ if sep != TOKENS: - return unicode.split(self, sep) + return str.split(self, sep) if len(self) == 0: return [] - return [[[x.replace("&slash;", "/") for x in token.split("/")] - for token in sentence.split(" ")] - for sentence in unicode.split(self, "\n")] + return [ + [ + [x.replace("&slash;", "/") for x in token.split("/")] + for token in sentence.split(" ") + ] + for sentence in str.split(self, "\n") + ] + #### SPELLING CORRECTION ########################################################################### # Based on: Peter Norvig, "How to Write a Spelling Corrector", http://norvig.com/spell-correct.html -class Spelling(lazydict): +class Spelling(lazydict): ALPHA = "abcdefghijklmnopqrstuvwxyz" def __init__(self, path=""): @@ -1342,21 +1635,20 @@ def language(self): @classmethod def train(self, s, path="spelling.txt"): - """ Counts the words in the given string and saves the probabilities at the given path. - This can be used to generate a new model for the Spelling() constructor. + """Counts the words in the given string and saves the probabilities at the given path. + This can be used to generate a new model for the Spelling() constructor. """ model = {} for w in re.findall("[a-z]+", s.lower()): model[w] = w in model and model[w] + 1 or 1 - model = ("%s %s" % (k, v) for k, v in sorted(model.items())) + model = (f"{k} {v}" for k, v in sorted(model.items())) model = "\n".join(model) f = open(path, "w") f.write(model) f.close() def _edit1(self, w): - """ Returns a set of words with edit distance 1 from the given word. - """ + """Returns a set of words with edit distance 1 from the given word.""" # Of all spelling errors, 80% is covered by edit distance 1. # Edit distance 1 = one character deleted, swapped, replaced or inserted. split = [(w[:i], w[i:]) for i in range(len(w) + 1)] @@ -1364,40 +1656,42 @@ def _edit1(self, w): [a + b[1:] for a, b in split if b], [a + b[1] + b[0] + b[2:] for a, b in split if len(b) > 1], [a + c + b[1:] for a, b in split for c in Spelling.ALPHA if b], - [a + c + b[0:] for a, b in split for c in Spelling.ALPHA] + [a + c + b[0:] for a, b in split for c in Spelling.ALPHA], ) return set(delete + transpose + replace + insert) def _edit2(self, w): - """ Returns a set of words with edit distance 2 from the given word - """ + """Returns a set of words with edit distance 2 from the given word""" # Of all spelling errors, 99% is covered by edit distance 2. # Only keep candidates that are actually known words (20% speedup). 
return set(e2 for e1 in self._edit1(w) for e2 in self._edit1(e1) if e2 in self) - def _known(self, words=[]): - """ Returns the given list of words filtered by known words. - """ + def _known(self, words=None): + """Returns the given list of words filtered by known words.""" + if words is None: + words = [] return set(w for w in words if w in self) def suggest(self, w): - """ Return a list of (word, confidence) spelling corrections for the given word, - based on the probability of known words with edit distance 1-2 from the given word. + """Return a list of (word, confidence) spelling corrections for the given word, + based on the probability of known words with edit distance 1-2 from the given word. """ if len(self) == 0: self.load() if len(w) == 1: - return [(w, 1.0)] # I + return [(w, 1.0)] # I if w in PUNCTUATION: - return [(w, 1.0)] # .?! + return [(w, 1.0)] # .?! if w in string.whitespace: - return [(w, 1.0)] # \n + return [(w, 1.0)] # \n if w.replace(".", "").isdigit(): - return [(w, 1.0)] # 1.5 - candidates = self._known([w]) \ - or self._known(self._edit1(w)) \ - or self._known(self._edit2(w)) \ - or [w] + return [(w, 1.0)] # 1.5 + candidates = ( + self._known([w]) + or self._known(self._edit1(w)) + or self._known(self._edit2(w)) + or [w] + ) candidates = [(self.get(c, 0.0), c) for c in candidates] s = float(sum(p for p, word in candidates) or 1) candidates = sorted(((p / s, word) for p, word in candidates), reverse=True) diff --git a/textblob/base.py b/src/textblob/base.py similarity index 86% rename from textblob/base.py rename to src/textblob/base.py index eaeca61f..2690d3f2 100644 --- a/textblob/base.py +++ b/src/textblob/base.py @@ -1,24 +1,22 @@ -# -*- coding: utf-8 -*- """Abstract base classes for models (taggers, noun phrase extractors, etc.) which define the interface for descendant classes. .. versionchanged:: 0.7.0 All base classes are defined in the same module, ``textblob.base``. """ -from __future__ import absolute_import from abc import ABCMeta, abstractmethod import nltk -from textblob.compat import with_metaclass - ##### POS TAGGERS ##### -class BaseTagger(with_metaclass(ABCMeta)): + +class BaseTagger(metaclass=ABCMeta): """Abstract tagger class from which all taggers inherit from. All descendants must implement a ``tag()`` method. """ + @abstractmethod def tag(self, text, tokenize=True): """Return a list of tuples of the form (word, tag) @@ -26,9 +24,11 @@ def tag(self, text, tokenize=True): """ return + ##### NOUN PHRASE EXTRACTORS ##### -class BaseNPExtractor(with_metaclass(ABCMeta)): + +class BaseNPExtractor(metaclass=ABCMeta): """Abstract base class from which all NPExtractor classes inherit. Descendant classes must implement an ``extract(text)`` method that returns a list of noun phrases as strings. @@ -39,13 +39,16 @@ def extract(self, text): """Return a list of noun phrases (strings) for a body of text.""" return + ##### TOKENIZERS ##### -class BaseTokenizer(with_metaclass(ABCMeta), nltk.tokenize.api.TokenizerI): + +class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta): """Abstract base class from which all Tokenizer classes inherit. Descendant classes must implement a ``tokenize(text)`` method that returns a list of noun phrases as strings. """ + @abstractmethod def tokenize(self, text): """Return a list of tokens (strings) for a body of text. 
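The base.py changes above drop the with_metaclass compat helper in favour of metaclass=ABCMeta. As a sketch of what a downstream subclass looks like against the refactored ABC (UpperCaseTagger is a made-up illustration, not part of the patch):

    # Illustrative sketch only -- UpperCaseTagger is not part of this patch.
    from textblob.base import BaseTagger

    class UpperCaseTagger(BaseTagger):
        """Toy tagger: all-caps tokens are tagged NNP, everything else NN."""

        def tag(self, text, tokenize=True):
            words = text.split() if tokenize else text
            return [(w, "NNP" if w.isupper() else "NN") for w in words]

    # As before, subclasses that omit tag() cannot be instantiated:
    # BaseTagger is still abstract, now via metaclass=ABCMeta directly.
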
@@ -63,18 +66,20 @@ def itokenize(self, text, *args, **kwargs): """ return (t for t in self.tokenize(text, *args, **kwargs)) + ##### SENTIMENT ANALYZERS #### -DISCRETE = 'ds' -CONTINUOUS = 'co' +DISCRETE = "ds" +CONTINUOUS = "co" -class BaseSentimentAnalyzer(with_metaclass(ABCMeta)): +class BaseSentimentAnalyzer(metaclass=ABCMeta): """Abstract base class from which all sentiment analyzers inherit. Should implement an ``analyze(text)`` method which returns either the results of analysis. """ + kind = DISCRETE def __init__(self): @@ -95,12 +100,15 @@ def analyze(self, text): # Analyze text return None + ##### PARSERS ##### -class BaseParser(with_metaclass(ABCMeta)): + +class BaseParser(metaclass=ABCMeta): """Abstract parser class from which all parsers inherit from. All descendants must implement a ``parse()`` method. """ + @abstractmethod def parse(self, text): """Parses the text.""" diff --git a/textblob/blob.py b/src/textblob/blob.py similarity index 69% rename from textblob/blob.py rename to src/textblob/blob.py index f53db1a7..4b2b3a77 100644 --- a/textblob/blob.py +++ b/src/textblob/blob.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Wrappers for various units of text, including the main :class:`TextBlob `, :class:`Word `, and :class:`WordList ` classes. @@ -19,34 +18,39 @@ .. versionchanged:: 0.8.0 These classes are now imported from ``textblob`` rather than ``text.blob``. -""" -from __future__ import unicode_literals, absolute_import -import sys +""" # noqa: E501 import json -import warnings +import sys from collections import defaultdict import nltk +from textblob.base import ( + BaseNPExtractor, + BaseParser, + BaseSentimentAnalyzer, + BaseTagger, + BaseTokenizer, +) from textblob.decorators import cached_property, requires_nltk_corpus -from textblob.utils import lowerstrip, PUNCTUATION_REGEX -from textblob.inflect import singularize as _singularize, pluralize as _pluralize +from textblob.en import suggest +from textblob.inflect import pluralize as _pluralize +from textblob.inflect import singularize as _singularize from textblob.mixins import BlobComparableMixin, StringlikeMixin -from textblob.compat import unicode, basestring -from textblob.base import (BaseNPExtractor, BaseTagger, BaseTokenizer, - BaseSentimentAnalyzer, BaseParser) from textblob.np_extractors import FastNPExtractor +from textblob.parsers import PatternParser +from textblob.sentiments import PatternAnalyzer from textblob.taggers import NLTKTagger from textblob.tokenizers import WordTokenizer, sent_tokenize, word_tokenize -from textblob.sentiments import PatternAnalyzer -from textblob.parsers import PatternParser -from textblob.translate import Translator -from textblob.en import suggest +from textblob.utils import PUNCTUATION_REGEX, lowerstrip # Wordnet interface # NOTE: textblob.wordnet is not imported so that the wordnet corpus can be lazy-loaded _wordnet = nltk.corpus.wordnet +basestring = (str, bytes) + + def _penn_to_wordnet(tag): """Converts a Penn corpus tag into a Wordnet tag.""" if tag in ("NN", "NNS", "NNP", "NNPS"): @@ -59,20 +63,19 @@ def _penn_to_wordnet(tag): return _wordnet.ADV return None -class Word(unicode): + +class Word(str): """A simple word representation. Includes methods for inflection, translation, and WordNet integration. """ - translator = Translator() - def __new__(cls, string, pos_tag=None): """Return a new instance of the class. It is necessary to override this method in order to handle the extra pos_tag argument in the constructor. 
""" - return super(Word, cls).__new__(cls, string) + return super().__new__(cls, string) def __init__(self, string, pos_tag=None): self.string = string @@ -89,63 +92,32 @@ def singularize(self): return Word(_singularize(self.string)) def pluralize(self): - '''Return the plural version of the word as a string.''' + """Return the plural version of the word as a string.""" return Word(_pluralize(self.string)) - def translate(self, from_lang='auto', to="en"): - '''Translate the word to another language using Google's - Translate API. - - .. deprecated:: 0.16.0 - Use the official Google Translate API instead. - .. versionadded:: 0.5.0 - ''' - warnings.warn( - 'Word.translate is deprecated and will be removed in a future release. ' - 'Use the official Google Translate API instead.', - DeprecationWarning - ) - return self.translator.translate(self.string, - from_lang=from_lang, to_lang=to) - - def detect_language(self): - '''Detect the word's language using Google's Translate API. - - .. deprecated:: 0.16.0 - Use the official Google Translate API istead. - .. versionadded:: 0.5.0 - ''' - warnings.warn( - 'Word.detect_language is deprecated and will be removed in a future release. ' - 'Use the official Google Translate API instead.', - DeprecationWarning - ) - return self.translator.detect(self.string) - def spellcheck(self): - '''Return a list of (word, confidence) tuples of spelling corrections. + """Return a list of (word, confidence) tuples of spelling corrections. Based on: Peter Norvig, "How to Write a Spelling Corrector" (http://norvig.com/spell-correct.html) as implemented in the pattern library. .. versionadded:: 0.6.0 - ''' + """ return suggest(self.string) def correct(self): - '''Correct the spelling of the word. Returns the word with the highest + """Correct the spelling of the word. Returns the word with the highest confidence using the spelling corrector. .. versionadded:: 0.6.0 - ''' + """ return Word(self.spellcheck()[0][0]) @cached_property @requires_nltk_corpus def lemma(self): - """Return the lemma of this word using Wordnet's morphy function. - """ + """Return the lemma of this word using Wordnet's morphy function.""" return self.lemmatize(pos=self.pos_tag) @requires_nltk_corpus @@ -170,8 +142,8 @@ def lemmatize(self, pos=None): LancasterStemmer = nltk.stem.lancaster.LancasterStemmer() SnowballStemmer = nltk.stem.snowball.SnowballStemmer("english") - #added 'stemmer' on lines of lemmatizer - #based on nltk + # added 'stemmer' on lines of lemmatizer + # based on nltk def stem(self, stemmer=PorterStemmer): """Stem a word using various NLTK stemmers. (Default: Porter Stemmer) @@ -230,20 +202,20 @@ def __init__(self, collection): """Initialize a WordList. Takes a collection of strings as its only argument. 
""" - super(WordList, self).__init__([Word(w) for w in collection]) + super().__init__([Word(w) for w in collection]) def __str__(self): """Returns a string representation for printing.""" - return super(WordList, self).__repr__() + return super().__repr__() def __repr__(self): """Returns a string representation for debugging.""" class_name = self.__class__.__name__ - return '{cls}({lst})'.format(cls=class_name, lst=super(WordList, self).__repr__()) + return f"{class_name}({super().__repr__()})" def __getitem__(self, key): """Returns a string at the given index.""" - item = super(WordList, self).__getitem__(key) + item = super().__getitem__(key) if isinstance(key, slice): return self.__class__(item) else: @@ -251,16 +223,16 @@ def __getitem__(self, key): def __getslice__(self, i, j): # This is included for Python 2.* compatibility - return self.__class__(super(WordList, self).__getslice__(i, j)) + return self.__class__(super().__getslice__(i, j)) def __setitem__(self, index, obj): """Places object at given index, replacing existing item. If the object is a string, inserts a :class:`Word ` object. """ if isinstance(obj, basestring): - super(WordList, self).__setitem__(index, Word(obj)) + super().__setitem__(index, Word(obj)) else: - super(WordList, self).__setitem__(index, obj) + super().__setitem__(index, obj) def count(self, strg, case_sensitive=False, *args, **kwargs): """Get the count of a word or phrase `s` within this WordList. @@ -269,18 +241,17 @@ def count(self, strg, case_sensitive=False, *args, **kwargs): :param case_sensitive: A boolean, whether or not the search is case-sensitive. """ if not case_sensitive: - return [word.lower() for word in self].count(strg.lower(), *args, - **kwargs) - return super(WordList, self).count(strg, *args, **kwargs) + return [word.lower() for word in self].count(strg.lower(), *args, **kwargs) + return super().count(strg, *args, **kwargs) def append(self, obj): """Append an object to end. If the object is a string, appends a :class:`Word ` object. """ if isinstance(obj, basestring): - super(WordList, self).append(Word(obj)) + super().append(Word(obj)) else: - super(WordList, self).append(obj) + super().append(obj) def extend(self, iterable): """Extend WordList by appending elements from ``iterable``. 
If an element @@ -325,26 +296,34 @@ def _validated_param(obj, name, base_class, default, base_class_name=None): """ base_class_name = base_class_name if base_class_name else base_class.__name__ if obj is not None and not isinstance(obj, base_class): - raise ValueError('{name} must be an instance of {cls}' - .format(name=name, cls=base_class_name)) + raise ValueError(f"{name} must be an instance of {base_class_name}") return obj or default -def _initialize_models(obj, tokenizer, pos_tagger, - np_extractor, analyzer, parser, classifier): +def _initialize_models( + obj, tokenizer, pos_tagger, np_extractor, analyzer, parser, classifier +): """Common initialization between BaseBlob and Blobber classes.""" # tokenizer may be a textblob or an NLTK tokenizer - obj.tokenizer = _validated_param(tokenizer, "tokenizer", - base_class=(BaseTokenizer, nltk.tokenize.api.TokenizerI), - default=BaseBlob.tokenizer, - base_class_name="BaseTokenizer") - obj.np_extractor = _validated_param(np_extractor, "np_extractor", - base_class=BaseNPExtractor, - default=BaseBlob.np_extractor) - obj.pos_tagger = _validated_param(pos_tagger, "pos_tagger", - BaseTagger, BaseBlob.pos_tagger) - obj.analyzer = _validated_param(analyzer, "analyzer", - BaseSentimentAnalyzer, BaseBlob.analyzer) + obj.tokenizer = _validated_param( + tokenizer, + "tokenizer", + base_class=(BaseTokenizer, nltk.tokenize.api.TokenizerI), + default=BaseBlob.tokenizer, + base_class_name="BaseTokenizer", + ) + obj.np_extractor = _validated_param( + np_extractor, + "np_extractor", + base_class=BaseNPExtractor, + default=BaseBlob.np_extractor, + ) + obj.pos_tagger = _validated_param( + pos_tagger, "pos_tagger", BaseTagger, BaseBlob.pos_tagger + ) + obj.analyzer = _validated_param( + analyzer, "analyzer", BaseSentimentAnalyzer, BaseBlob.analyzer + ) obj.parser = _validated_param(parser, "parser", BaseParser, BaseBlob.parser) obj.classifier = classifier @@ -369,28 +348,41 @@ class BaseBlob(StringlikeMixin, BlobComparableMixin): .. versionchanged:: 0.6.0 ``clean_html`` parameter deprecated, as it was in NLTK. - """ + """ # noqa: E501 + np_extractor = FastNPExtractor() pos_tagger = NLTKTagger() tokenizer = WordTokenizer() - translator = Translator() analyzer = PatternAnalyzer() parser = PatternParser() - def __init__(self, text, tokenizer=None, - pos_tagger=None, np_extractor=None, analyzer=None, - parser=None, classifier=None, clean_html=False): + def __init__( + self, + text, + tokenizer=None, + pos_tagger=None, + np_extractor=None, + analyzer=None, + parser=None, + classifier=None, + clean_html=False, + ): if not isinstance(text, basestring): - raise TypeError('The `text` argument passed to `__init__(text)` ' - 'must be a string, not {0}'.format(type(text))) + raise TypeError( + "The `text` argument passed to `__init__(text)` " + f"must be a string, not {type(text)}" + ) if clean_html: - raise NotImplementedError("clean_html has been deprecated. " - "To remove HTML markup, use BeautifulSoup's " - "get_text() function") + raise NotImplementedError( + "clean_html has been deprecated. 
" + "To remove HTML markup, use BeautifulSoup's " + "get_text() function" + ) self.raw = self.string = text self.stripped = lowerstrip(self.raw, all=True) - _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer, - parser, classifier) + _initialize_models( + self, tokenizer, pos_tagger, np_extractor, analyzer, parser, classifier + ) @cached_property def words(self): @@ -479,9 +471,13 @@ def subjectivity(self): @cached_property def noun_phrases(self): """Returns a list of noun phrases for this blob.""" - return WordList([phrase.strip().lower() - for phrase in self.np_extractor.extract(self.raw) - if len(phrase) > 1]) + return WordList( + [ + phrase.strip().lower() + for phrase in self.np_extractor.extract(self.raw) + if len(phrase) > 1 + ] + ) @cached_property def pos_tags(self): @@ -496,18 +492,23 @@ def pos_tags(self): :rtype: list of tuples """ if isinstance(self, TextBlob): - return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist] + return [ + val + for sublist in [s.pos_tags for s in self.sentences] + for val in sublist + ] else: - return [(Word(unicode(word), pos_tag=t), unicode(t)) - for word, t in self.pos_tagger.tag(self) - if not PUNCTUATION_REGEX.match(unicode(t))] + return [ + (Word(str(word), pos_tag=t), str(t)) + for word, t in self.pos_tagger.tag(self) + if not PUNCTUATION_REGEX.match(str(t)) + ] tags = pos_tags @cached_property def word_counts(self): - """Dictionary of word frequencies in this text. - """ + """Dictionary of word frequencies in this text.""" counts = defaultdict(int) stripped_words = [lowerstrip(word) for word in self.words] for word in stripped_words: @@ -516,8 +517,7 @@ def word_counts(self): @cached_property def np_counts(self): - """Dictionary of noun phrase frequencies in this text. - """ + """Dictionary of noun phrase frequencies in this text.""" counts = defaultdict(int) for phrase in self.noun_phrases: counts[phrase] += 1 @@ -531,71 +531,11 @@ def ngrams(self, n=3): """ if n <= 0: return [] - grams = [WordList(self.words[i:i + n]) - for i in range(len(self.words) - n + 1)] + grams = [ + WordList(self.words[i : i + n]) for i in range(len(self.words) - n + 1) + ] return grams - def translate(self, from_lang="auto", to="en"): - """Translate the blob to another language. - Uses the Google Translate API. Returns a new TextBlob. - - Requires an internet connection. - - Usage: - :: - - >>> b = TextBlob("Simple is better than complex") - >>> b.translate(to="es") - TextBlob('Lo simple es mejor que complejo') - - Language code reference: - https://developers.google.com/translate/v2/using_rest#language-params - - .. deprecated:: 0.16.0 - Use the official Google Translate API instead. - .. versionadded:: 0.5.0. - - :param str from_lang: Language to translate from. If ``None``, will attempt - to detect the language. - :param str to: Language to translate to. - :rtype: :class:`BaseBlob ` - """ - warnings.warn( - 'TextBlob.translate is deprecated and will be removed in a future release. ' - 'Use the official Google Translate API instead.', - DeprecationWarning - ) - return self.__class__(self.translator.translate(self.raw, - from_lang=from_lang, to_lang=to)) - - def detect_language(self): - """Detect the blob's language using the Google Translate API. - - Requires an internet connection. - - Usage: - :: - - >>> b = TextBlob("bonjour") - >>> b.detect_language() - u'fr' - - Language code reference: - https://developers.google.com/translate/v2/using_rest#language-params - - .. 
deprecated:: 0.16.0 - Use the official Google Translate API instead. - .. versionadded:: 0.5.0 - - :rtype: str - """ - warnings.warn( - 'TextBlob.detext_translate is deprecated and will be removed in a future release. ' - 'Use the official Google Translate API instead.', - DeprecationWarning - ) - return self.translator.detect(self.raw) - def correct(self): """Attempt to correct the spelling of a blob. @@ -606,7 +546,7 @@ def correct(self): # regex matches: word or punctuation or whitespace tokens = nltk.tokenize.regexp_tokenize(self.raw, r"\w+|[^\w\s]|\s") corrected = (Word(w).correct() for w in tokens) - ret = ''.join(corrected) + ret = "".join(corrected) return self.__class__(ret) def _cmpkey(self): @@ -623,19 +563,20 @@ def __hash__(self): return hash(self._cmpkey()) def __add__(self, other): - '''Concatenates two text objects the same way Python strings are + """Concatenates two text objects the same way Python strings are concatenated. Arguments: - `other`: a string or a text object - ''' + """ if isinstance(other, basestring): return self.__class__(self.raw + other) elif isinstance(other, BaseBlob): return self.__class__(self.raw + other.raw) else: - raise TypeError('Operands must be either strings or {0} objects' - .format(self.__class__.__name__)) + raise TypeError( + f"Operands must be either strings or {self.__class__.__name__} objects" + ) def split(self, sep=None, maxsplit=sys.maxsize): """Behaves like the built-in str.split() except returns a @@ -660,7 +601,7 @@ class TextBlob(BaseBlob): :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to :class:`PatternAnalyzer `. :param classifier: (optional) A classifier. - """ + """ # noqa: E501 @cached_property def sentences(self): @@ -688,26 +629,25 @@ def serialized(self): return [sentence.dict for sentence in self.sentences] def to_json(self, *args, **kwargs): - '''Return a json representation (str) of this blob. + """Return a json representation (str) of this blob. Takes the same arguments as json.dumps. .. versionadded:: 0.5.1 - ''' + """ return json.dumps(self.serialized, *args, **kwargs) @property def json(self): - '''The json representation of this blob. + """The json representation of this blob. .. versionchanged:: 0.5.1 Made ``json`` a property instead of a method to restore backwards compatibility that was broken after version 0.4.0. - ''' + """ return self.to_json() def _create_sentence_objects(self): - '''Returns a list of Sentence objects from the raw text. 
- ''' + """Returns a list of Sentence objects from the raw text.""" sentence_objects = [] sentences = sent_tokenize(self.raw) char_index = 0 # Keeps track of character index within the blob @@ -718,10 +658,17 @@ def _create_sentence_objects(self): char_index += len(sent) end_index = start_index + len(sent) # Sentences share the same models as their parent blob - s = Sentence(sent, start_index=start_index, end_index=end_index, - tokenizer=self.tokenizer, np_extractor=self.np_extractor, - pos_tagger=self.pos_tagger, analyzer=self.analyzer, - parser=self.parser, classifier=self.classifier) + s = Sentence( + sent, + start_index=start_index, + end_index=end_index, + tokenizer=self.tokenizer, + np_extractor=self.np_extractor, + pos_tagger=self.pos_tagger, + analyzer=self.analyzer, + parser=self.parser, + classifier=self.classifier, + ) sentence_objects.append(s) return sentence_objects @@ -738,7 +685,7 @@ class Sentence(BaseBlob): """ def __init__(self, sentence, start_index=0, end_index=None, *args, **kwargs): - super(Sentence, self).__init__(sentence, *args, **kwargs) + super().__init__(sentence, *args, **kwargs) #: The start index within a TextBlob self.start = self.start_index = start_index #: The end index within a textBlob @@ -746,19 +693,19 @@ def __init__(self, sentence, start_index=0, end_index=None, *args, **kwargs): @property def dict(self): - '''The dict representation of this sentence.''' + """The dict representation of this sentence.""" return { - 'raw': self.raw, - 'start_index': self.start_index, - 'end_index': self.end_index, - 'stripped': self.stripped, - 'noun_phrases': self.noun_phrases, - 'polarity': self.polarity, - 'subjectivity': self.subjectivity, + "raw": self.raw, + "start_index": self.start_index, + "end_index": self.end_index, + "stripped": self.stripped, + "noun_phrases": self.noun_phrases, + "polarity": self.polarity, + "subjectivity": self.subjectivity, } -class Blobber(object): +class Blobber: """A factory for TextBlobs that all share the same tagger, tokenizer, parser, classifier, and np_extractor. @@ -786,7 +733,7 @@ class Blobber(object): :param classifier: A classifier. .. versionadded:: 0.4.0 - """ + """ # noqa: E501 np_extractor = FastNPExtractor() pos_tagger = NLTKTagger() @@ -794,10 +741,18 @@ class Blobber(object): analyzer = PatternAnalyzer() parser = PatternParser() - def __init__(self, tokenizer=None, pos_tagger=None, np_extractor=None, - analyzer=None, parser=None, classifier=None): - _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer, - parser, classifier) + def __init__( + self, + tokenizer=None, + pos_tagger=None, + np_extractor=None, + analyzer=None, + parser=None, + classifier=None, + ): + _initialize_models( + self, tokenizer, pos_tagger, np_extractor, analyzer, parser, classifier + ) def __call__(self, text): """Return a new TextBlob object with this Blobber's ``np_extractor``, @@ -805,20 +760,30 @@ def __call__(self, text): :returns: A new :class:`TextBlob `. 
""" - return TextBlob(text, tokenizer=self.tokenizer, pos_tagger=self.pos_tagger, - np_extractor=self.np_extractor, analyzer=self.analyzer, - parser=self.parser, - classifier=self.classifier) + return TextBlob( + text, + tokenizer=self.tokenizer, + pos_tagger=self.pos_tagger, + np_extractor=self.np_extractor, + analyzer=self.analyzer, + parser=self.parser, + classifier=self.classifier, + ) def __repr__(self): - classifier_name = self.classifier.__class__.__name__ + "()" if self.classifier else "None" - return ("Blobber(tokenizer={0}(), pos_tagger={1}(), " - "np_extractor={2}(), analyzer={3}(), parser={4}(), classifier={5})")\ - .format(self.tokenizer.__class__.__name__, - self.pos_tagger.__class__.__name__, - self.np_extractor.__class__.__name__, - self.analyzer.__class__.__name__, - self.parser.__class__.__name__, - classifier_name) + classifier_name = ( + self.classifier.__class__.__name__ + "()" if self.classifier else "None" + ) + return ( + "Blobber(tokenizer={}(), pos_tagger={}(), " + "np_extractor={}(), analyzer={}(), parser={}(), classifier={})" + ).format( + self.tokenizer.__class__.__name__, + self.pos_tagger.__class__.__name__, + self.np_extractor.__class__.__name__, + self.analyzer.__class__.__name__, + self.parser.__class__.__name__, + classifier_name, + ) __str__ = __repr__ diff --git a/textblob/classifiers.py b/src/textblob/classifiers.py similarity index 79% rename from textblob/classifiers.py rename to src/textblob/classifiers.py index 9e0b5b20..74461e2c 100644 --- a/textblob/classifiers.py +++ b/src/textblob/classifiers.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Various classifier implementations. Also includes basic feature extractor methods. @@ -30,18 +29,18 @@ neg .. versionadded:: 0.6.0 -""" -from __future__ import absolute_import +""" # noqa: E501 from itertools import chain import nltk -from textblob.compat import basestring +import textblob.formats as formats from textblob.decorators import cached_property from textblob.exceptions import FormatError from textblob.tokenizers import word_tokenize -from textblob.utils import strip_punc, is_filelike -import textblob.formats as formats +from textblob.utils import is_filelike, strip_punc + +basestring = (str, bytes) ### Basic feature extractors ### @@ -52,6 +51,7 @@ def _get_words_from_dataset(dataset): :param dataset: A list of tuples of the form ``(words, label)`` where ``words`` is either a string of a list of tokens. """ + # Words may be either a string or a list of tokens. Return an iterator # of tokens accordingly def tokenize(words): @@ -59,17 +59,22 @@ def tokenize(words): return word_tokenize(words, include_punc=False) else: return words + all_words = chain.from_iterable(tokenize(words) for words, _ in dataset) return set(all_words) + def _get_document_tokens(document): if isinstance(document, basestring): - tokens = set((strip_punc(w, all=False) - for w in word_tokenize(document, include_punc=False))) + tokens = set( + strip_punc(w, all=False) + for w in word_tokenize(document, include_punc=False) + ) else: tokens = set(strip_punc(w, all=False) for w in document) return tokens + def basic_extractor(document, train_set): """A basic document feature extractor that returns a dict indicating what words in ``train_set`` are contained in ``document``. 
@@ -87,14 +92,13 @@ def basic_extractor(document, train_set): word_features = [w for w in chain([el_zero], train_set)] else: try: - assert(isinstance(el_zero[0], basestring)) + assert isinstance(el_zero[0], basestring) word_features = _get_words_from_dataset(chain([el_zero], train_set)) - except Exception: - raise ValueError('train_set is probably malformed.') + except Exception as error: + raise ValueError("train_set is probably malformed.") from error tokens = _get_document_tokens(document) - features = dict(((u'contains({0})'.format(word), (word in tokens)) - for word in word_features)) + features = dict((f"contains({word})", (word in tokens)) for word in word_features) return features @@ -103,12 +107,14 @@ def contains_extractor(document): the document contains. """ tokens = _get_document_tokens(document) - features = dict((u'contains({0})'.format(w), True) for w in tokens) + features = dict((f"contains({w})", True) for w in tokens) return features + ##### CLASSIFIERS ##### -class BaseClassifier(object): + +class BaseClassifier: """Abstract classifier class from which all classifers inherit. At a minimum, descendant classes must implement a ``classify`` method and have a ``classifier`` property. @@ -129,14 +135,18 @@ class BaseClassifier(object): .. versionadded:: 0.6.0 """ - def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **kwargs): + def __init__( + self, train_set, feature_extractor=basic_extractor, format=None, **kwargs + ): self.format_kwargs = kwargs self.feature_extractor = feature_extractor if is_filelike(train_set): self.train_set = self._read_data(train_set, format) else: # train_set is a list of tuples self.train_set = train_set - self._word_set = _get_words_from_dataset(self.train_set) # Keep a hidden set of unique words. + self._word_set = _get_words_from_dataset( + self.train_set + ) # Keep a hidden set of unique words. self.train_features = None def _read_data(self, dataset, format=None): @@ -147,12 +157,14 @@ def _read_data(self, dataset, format=None): if not format: format_class = formats.detect(dataset) if not format_class: - raise FormatError('Could not automatically detect format for the given ' - 'data source.') + raise FormatError( + "Could not automatically detect format for the given " + "data source." + ) else: registry = formats.get_registry() if format not in registry.keys(): - raise ValueError("'{0}' format not supported.".format(format)) + raise ValueError(f"'{format}' format not supported.") format_class = registry[format] return format_class(dataset, **self.format_kwargs).to_iterable() @@ -174,10 +186,10 @@ def labels(self): raise NotImplementedError('Must implement a "labels" method.') def extract_features(self, text): - '''Extracts features from a body of text. + """Extracts features from a body of text. :rtype: dictionary of features - ''' + """ # Feature extractor may take one or two arguments try: return self.feature_extractor(text, self._word_set) @@ -200,24 +212,25 @@ class MyClassifier(NLTKClassifier): #: The NLTK class to be wrapped. 
Must be a class within nltk.classify nltk_class = None - def __init__(self, train_set, - feature_extractor=basic_extractor, format=None, **kwargs): - super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs) + def __init__( + self, train_set, feature_extractor=basic_extractor, format=None, **kwargs + ): + super().__init__(train_set, feature_extractor, format, **kwargs) self.train_features = [(self.extract_features(d), c) for d, c in self.train_set] def __repr__(self): class_name = self.__class__.__name__ - return "<{cls} trained on {n} instances>".format(cls=class_name, - n=len(self.train_set)) + return f"<{class_name} trained on {len(self.train_set)} instances>" @cached_property def classifier(self): """The classifier.""" try: return self.train() - except AttributeError: # nltk_class has not been defined - raise ValueError("NLTKClassifier must have a nltk_class" - " variable that is not None.") + except AttributeError as error: # nltk_class has not been defined + raise ValueError( + "NLTKClassifier must have a nltk_class" " variable that is not None." + ) from error def train(self, *args, **kwargs): """Train the classifier with a labeled feature set and return @@ -231,12 +244,14 @@ def train(self, *args, **kwargs): :rtype: A classifier """ try: - self.classifier = self.nltk_class.train(self.train_features, - *args, **kwargs) + self.classifier = self.nltk_class.train( + self.train_features, *args, **kwargs + ) return self.classifier - except AttributeError: - raise ValueError("NLTKClassifier must have a nltk_class" - " variable that is not None.") + except AttributeError as error: + raise ValueError( + "NLTKClassifier must have a nltk_class" " variable that is not None." + ) from error def labels(self): """Return an iterable of possible labels.""" @@ -275,14 +290,15 @@ def update(self, new_data, *args, **kwargs): """ self.train_set += new_data self._word_set.update(_get_words_from_dataset(new_data)) - self.train_features = [(self.extract_features(d), c) - for d, c in self.train_set] + self.train_features = [(self.extract_features(d), c) for d, c in self.train_set] try: - self.classifier = self.nltk_class.train(self.train_features, - *args, **kwargs) - except AttributeError: # Descendant has not defined nltk_class - raise ValueError("NLTKClassifier must have a nltk_class" - " variable that is not None.") + self.classifier = self.nltk_class.train( + self.train_features, *args, **kwargs + ) + except AttributeError as error: # Descendant has not defined nltk_class + raise ValueError( + "NLTKClassifier must have a nltk_class" " variable that is not None." 
+ ) from error return True @@ -421,23 +437,27 @@ class PositiveNaiveBayesClassifier(NLTKClassifier): nltk_class = nltk.classify.PositiveNaiveBayesClassifier - def __init__(self, positive_set, unlabeled_set, - feature_extractor=contains_extractor, - positive_prob_prior=0.5, **kwargs): + def __init__( + self, + positive_set, + unlabeled_set, + feature_extractor=contains_extractor, + positive_prob_prior=0.5, + **kwargs, + ): self.feature_extractor = feature_extractor self.positive_set = positive_set self.unlabeled_set = unlabeled_set - self.positive_features = [self.extract_features(d) - for d in self.positive_set] - self.unlabeled_features = [self.extract_features(d) - for d in self.unlabeled_set] + self.positive_features = [self.extract_features(d) for d in self.positive_set] + self.unlabeled_features = [self.extract_features(d) for d in self.unlabeled_set] self.positive_prob_prior = positive_prob_prior def __repr__(self): class_name = self.__class__.__name__ - return "<{cls} trained on {n_pos} labeled and {n_unlabeled} unlabeled instances>"\ - .format(cls=class_name, n_pos=len(self.positive_set), - n_unlabeled=len(self.unlabeled_set)) + return ( + f"<{class_name} trained on {len(self.positive_set)} labeled " + f"and {len(self.unlabeled_set)} unlabeled instances>" + ) # Override def train(self, *args, **kwargs): @@ -449,14 +469,19 @@ def train(self, *args, **kwargs): :rtype: A classifier """ - self.classifier = self.nltk_class.train(self.positive_features, - self.unlabeled_features, - self.positive_prob_prior) + self.classifier = self.nltk_class.train( + self.positive_features, self.unlabeled_features, self.positive_prob_prior + ) return self.classifier - def update(self, new_positive_data=None, - new_unlabeled_data=None, positive_prob_prior=0.5, - *args, **kwargs): + def update( + self, + new_positive_data=None, + new_unlabeled_data=None, + positive_prob_prior=0.5, + *args, + **kwargs, + ): """Update the classifier with new data and re-trains the classifier. @@ -466,16 +491,21 @@ def update(self, new_positive_data=None, self.positive_prob_prior = positive_prob_prior if new_positive_data: self.positive_set += new_positive_data - self.positive_features += [self.extract_features(d) - for d in new_positive_data] + self.positive_features += [ + self.extract_features(d) for d in new_positive_data + ] if new_unlabeled_data: self.unlabeled_set += new_unlabeled_data - self.unlabeled_features += [self.extract_features(d) - for d in new_unlabeled_data] - self.classifier = self.nltk_class.train(self.positive_features, - self.unlabeled_features, - self.positive_prob_prior, - *args, **kwargs) + self.unlabeled_features += [ + self.extract_features(d) for d in new_unlabeled_data + ] + self.classifier = self.nltk_class.train( + self.positive_features, + self.unlabeled_features, + self.positive_prob_prior, + *args, + **kwargs, + ) return True diff --git a/textblob/decorators.py b/src/textblob/decorators.py similarity index 77% rename from textblob/decorators.py rename to src/textblob/decorators.py index 1603266a..9b91ce87 100644 --- a/textblob/decorators.py +++ b/src/textblob/decorators.py @@ -1,12 +1,11 @@ -# -*- coding: utf-8 -*- """Custom decorators.""" -from __future__ import absolute_import from functools import wraps + from textblob.exceptions import MissingCorpusError -class cached_property(object): +class cached_property: """A property that is only computed once per instance and then replaces itself with an ordinary attribute. Deleting the attribute resets the property. 
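As a quick illustration of the memoizing behaviour described in the docstring above, a hypothetical sketch (not part of the patch):

    from textblob.decorators import cached_property

    class Report:
        @cached_property
        def summary(self):
            print("computing...")  # runs only on the first access
            return "done"

    r = Report()
    r.summary       # prints "computing...", returns "done"
    r.summary       # returns the cached value; no recomputation
    del r.summary   # deleting the attribute resets the property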
@@ -15,7 +14,7 @@ class cached_property(object): """ def __init__(self, func): - self.__doc__ = getattr(func, '__doc__') + self.__doc__ = func.__doc__ self.func = func def __get__(self, obj, cls): @@ -29,11 +28,12 @@ def requires_nltk_corpus(func): """Wraps a function that requires an NLTK corpus. If the corpus isn't found, raise a :exc:`MissingCorpusError`. """ + @wraps(func) def decorated(*args, **kwargs): try: return func(*args, **kwargs) - except LookupError as err: - print(err) - raise MissingCorpusError() + except LookupError as error: + raise MissingCorpusError() from error + return decorated diff --git a/textblob/download_corpora.py b/src/textblob/download_corpora.py similarity index 63% rename from textblob/download_corpora.py rename to src/textblob/download_corpora.py index 47231a80..d51ccd4f 100644 --- a/textblob/download_corpora.py +++ b/src/textblob/download_corpora.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- """Downloads the necessary NLTK corpora for TextBlob. Usage: :: @@ -13,22 +12,24 @@ """ import sys + import nltk MIN_CORPORA = [ - 'brown', # Required for FastNPExtractor - 'punkt', # Required for WordTokenizer - 'wordnet', # Required for lemmatization - 'averaged_perceptron_tagger', # Required for NLTKTagger + "brown", # Required for FastNPExtractor + "punkt", # Required for WordTokenizer + "wordnet", # Required for lemmatization + "averaged_perceptron_tagger", # Required for NLTKTagger ] ADDITIONAL_CORPORA = [ - 'conll2000', # Required for ConllExtractor - 'movie_reviews', # Required for NaiveBayesAnalyzer + "conll2000", # Required for ConllExtractor + "movie_reviews", # Required for NaiveBayesAnalyzer ] ALL_CORPORA = MIN_CORPORA + ADDITIONAL_CORPORA + def download_lite(): for each in MIN_CORPORA: nltk.download(each) @@ -40,12 +41,12 @@ def download_all(): def main(): - if 'lite' in sys.argv: + if "lite" in sys.argv: download_lite() else: download_all() print("Finished.") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/textblob/en/__init__.py b/src/textblob/en/__init__.py new file mode 100644 index 00000000..0f080643 --- /dev/null +++ b/src/textblob/en/__init__.py @@ -0,0 +1,133 @@ +"""This file is based on pattern.en. See the bundled NOTICE file for +license information. +""" +import os + +from textblob._text import CHUNK, PENN, PNP, POS, UNIVERSAL, WORD, Lexicon, Spelling +from textblob._text import Parser as _Parser +from textblob._text import Sentiment as _Sentiment + +try: + MODULE = os.path.dirname(os.path.abspath(__file__)) +except: + MODULE = "" + +spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt")) + +# --- ENGLISH PARSER -------------------------------------------------------------------------------- + + +def find_lemmata(tokens): + """Annotates the tokens with lemmata for plural nouns and conjugated verbs, + where each token is a [word, part-of-speech] list. 
+ """ + for token in tokens: + word, pos, lemma = token[0], token[1], token[0] + # cats => cat + if pos == "NNS": + lemma = singularize(word) + # sat => sit + if pos.startswith(("VB", "MD")): + lemma = conjugate(word, INFINITIVE) or word + token.append(lemma.lower()) + return tokens + + +class Parser(_Parser): + def find_lemmata(self, tokens, **kwargs): + return find_lemmata(tokens) + + def find_tags(self, tokens, **kwargs): + if kwargs.get("tagset") in (PENN, None): + kwargs.setdefault("map", lambda token, tag: (token, tag)) + if kwargs.get("tagset") == UNIVERSAL: + kwargs.setdefault( + "map", lambda token, tag: penntreebank2universal(token, tag) + ) + return _Parser.find_tags(self, tokens, **kwargs) + + +class Sentiment(_Sentiment): + def load(self, path=None): + _Sentiment.load(self, path) + # Map "terrible" to adverb "terribly" (+1% accuracy) + if not path: + for w, pos in list(dict.items(self)): + if "JJ" in pos: + if w.endswith("y"): + w = w[:-1] + "i" + if w.endswith("le"): + w = w[:-2] + p, s, i = pos["JJ"] + self.annotate(w + "ly", "RB", p, s, i) + + +lexicon = Lexicon( + path=os.path.join(MODULE, "en-lexicon.txt"), + morphology=os.path.join(MODULE, "en-morphology.txt"), + context=os.path.join(MODULE, "en-context.txt"), + entities=os.path.join(MODULE, "en-entities.txt"), + language="en", +) +parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en") + +sentiment = Sentiment( + path=os.path.join(MODULE, "en-sentiment.xml"), + synset="wordnet_id", + negations=("no", "not", "n't", "never"), + modifiers=("RB",), + modifier=lambda w: w.endswith("ly"), + tokenizer=parser.find_tokens, + language="en", +) + + +def tokenize(s, *args, **kwargs): + """Returns a list of sentences, where punctuation marks have been split from words.""" + return parser.find_tokens(str(s), *args, **kwargs) + + +def parse(s, *args, **kwargs): + """Returns a tagged str string.""" + return parser.parse(str(s), *args, **kwargs) + + +def parsetree(s, *args, **kwargs): + """Returns a parsed Text from the given string.""" + return Text(parse(str(s), *args, **kwargs)) + + +def split(s, token=None): + """Returns a parsed Text from the given parsed string.""" + if token is None: + token = [WORD, POS, CHUNK, PNP] + return Text(str(s), token) + + +def tag(s, tokenize=True, encoding="utf-8"): + """Returns a list of (token, tag)-tuples from the given string.""" + tags = [] + for sentence in parse(s, tokenize, True, False, False, False, encoding).split(): + for token in sentence: + tags.append((token[0], token[1])) + return tags + + +def suggest(w): + """Returns a list of (word, confidence)-tuples of spelling corrections.""" + return spelling.suggest(w) + + +def polarity(s, **kwargs): + """Returns the sentence polarity (positive/negative) between -1.0 and 1.0.""" + return sentiment(str(s), **kwargs)[0] + + +def subjectivity(s, **kwargs): + """Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0.""" + return sentiment(str(s), **kwargs)[1] + + +def positive(s, threshold=0.1, **kwargs): + """Returns True if the given sentence has a positive sentiment (polarity >= threshold).""" + return polarity(str(s), **kwargs) >= threshold diff --git a/textblob/en/en-context.txt b/src/textblob/en/en-context.txt similarity index 100% rename from textblob/en/en-context.txt rename to src/textblob/en/en-context.txt diff --git a/textblob/en/en-entities.txt b/src/textblob/en/en-entities.txt similarity index 100% rename from textblob/en/en-entities.txt rename to src/textblob/en/en-entities.txt diff --git 
a/textblob/en/en-lexicon.txt b/src/textblob/en/en-lexicon.txt similarity index 100% rename from textblob/en/en-lexicon.txt rename to src/textblob/en/en-lexicon.txt diff --git a/textblob/en/en-morphology.txt b/src/textblob/en/en-morphology.txt similarity index 100% rename from textblob/en/en-morphology.txt rename to src/textblob/en/en-morphology.txt diff --git a/textblob/en/en-sentiment.xml b/src/textblob/en/en-sentiment.xml similarity index 100% rename from textblob/en/en-sentiment.xml rename to src/textblob/en/en-sentiment.xml diff --git a/textblob/en/en-spelling.txt b/src/textblob/en/en-spelling.txt similarity index 100% rename from textblob/en/en-spelling.txt rename to src/textblob/en/en-spelling.txt diff --git a/src/textblob/en/inflect.py b/src/textblob/en/inflect.py new file mode 100644 index 00000000..3d4ba244 --- /dev/null +++ b/src/textblob/en/inflect.py @@ -0,0 +1,878 @@ +"""The pluralize and singular methods from the pattern library. + +Licenced under the BSD. +See here https://github.com/clips/pattern/blob/master/LICENSE.txt for +complete license information. +""" +import re + +VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB" + +#### PLURALIZE ##################################################################################### +# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway: +# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html + +# Prepositions are used to solve things like +# "mother-in-law" or "man at arms" +plural_prepositions = [ + "about", + "above", + "across", + "after", + "among", + "around", + "at", + "athwart", + "before", + "behind", + "below", + "beneath", + "beside", + "besides", + "between", + "betwixt", + "beyond", + "but", + "by", + "during", + "except", + "for", + "from", + "in", + "into", + "near", + "of", + "off", + "on", + "onto", + "out", + "over", + "since", + "till", + "to", + "under", + "until", + "unto", + "upon", + "with", +] + +# Inflection rules that are either general, +# or apply to a certain category of words, +# or apply to a certain category of words only in classical mode, +# or apply only in classical mode. +# Each rule consists of: +# suffix, inflection, category and classic flag. +plural_rules = [ + # 0) Indefinite articles and demonstratives. + [ + ["^a$|^an$", "some", None, False], + ["^this$", "these", None, False], + ["^that$", "those", None, False], + ["^any$", "all", None, False], + ], + # 1) Possessive adjectives. + # Overlaps with 1/ for "his" and "its". + # Overlaps with 2/ for "her". + [ + ["^my$", "our", None, False], + ["^your$|^thy$", "your", None, False], + ["^her$|^his$|^its$|^their$", "their", None, False], + ], + # 2) Possessive pronouns. + [ + ["^mine$", "ours", None, False], + ["^yours$|^thine$", "yours", None, False], + ["^hers$|^his$|^its$|^theirs$", "theirs", None, False], + ], + # 3) Personal pronouns. + [ + ["^I$", "we", None, False], + ["^me$", "us", None, False], + ["^myself$", "ourselves", None, False], + ["^you$", "you", None, False], + ["^thou$|^thee$", "ye", None, False], + ["^yourself$|^thyself$", "yourself", None, False], + ["^she$|^he$|^it$|^they$", "they", None, False], + ["^her$|^him$|^it$|^them$", "them", None, False], + ["^herself$|^himself$|^itself$|^themself$", "themselves", None, False], + ["^oneself$", "oneselves", None, False], + ], + # 4) Words that do not inflect. 
+ [ + ["$", "", "uninflected", False], + ["$", "", "uncountable", False], + ["fish$", "fish", None, False], + ["([- ])bass$", "\\1bass", None, False], + ["ois$", "ois", None, False], + ["sheep$", "sheep", None, False], + ["deer$", "deer", None, False], + ["pox$", "pox", None, False], + ["([A-Z].*)ese$", "\\1ese", None, False], + ["itis$", "itis", None, False], + [ + "(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", + "\\1ose", + None, + False, + ], + ], + # 5) Irregular plurals (mongoose, oxen). + [ + ["atlas$", "atlantes", None, True], + ["atlas$", "atlases", None, False], + ["beef$", "beeves", None, True], + ["brother$", "brethren", None, True], + ["child$", "children", None, False], + ["corpus$", "corpora", None, True], + ["corpus$", "corpuses", None, False], + ["^cow$", "kine", None, True], + ["ephemeris$", "ephemerides", None, False], + ["ganglion$", "ganglia", None, True], + ["genie$", "genii", None, True], + ["genus$", "genera", None, False], + ["graffito$", "graffiti", None, False], + ["loaf$", "loaves", None, False], + ["money$", "monies", None, True], + ["mongoose$", "mongooses", None, False], + ["mythos$", "mythoi", None, False], + ["octopus$", "octopodes", None, True], + ["opus$", "opera", None, True], + ["opus$", "opuses", None, False], + ["^ox$", "oxen", None, False], + ["penis$", "penes", None, True], + ["penis$", "penises", None, False], + ["soliloquy$", "soliloquies", None, False], + ["testis$", "testes", None, False], + ["trilby$", "trilbys", None, False], + ["turf$", "turves", None, True], + ["numen$", "numena", None, False], + ["occiput$", "occipita", None, True], + ], + # 6) Irregular inflections for common suffixes (synopses, mice, men). + [ + ["man$", "men", None, False], + ["person$", "people", None, False], + ["([lm])ouse$", "\\1ice", None, False], + ["tooth$", "teeth", None, False], + ["goose$", "geese", None, False], + ["foot$", "feet", None, False], + ["zoon$", "zoa", None, False], + ["([csx])is$", "\\1es", None, False], + ], + # 7) Fully assimilated classical inflections (vertebrae, codices). + [ + ["ex$", "ices", "ex-ices", False], + ["ex$", "ices", "ex-ices-classical", True], + ["um$", "a", "um-a", False], + ["um$", "a", "um-a-classical", True], + ["on$", "a", "on-a", False], + ["a$", "ae", "a-ae", False], + ["a$", "ae", "a-ae-classical", True], + ], + # 8) Classical variants of modern inflections (stigmata, soprani). + [ + ["trix$", "trices", None, True], + ["eau$", "eaux", None, True], + ["ieu$", "ieu", None, True], + ["([iay])nx$", "\\1nges", None, True], + ["en$", "ina", "en-ina-classical", True], + ["a$", "ata", "a-ata-classical", True], + ["is$", "ides", "is-ides-classical", True], + ["us$", "i", "us-i-classical", True], + ["us$", "us", "us-us-classical", True], + ["o$", "i", "o-i-classical", True], + ["$", "i", "-i-classical", True], + ["$", "im", "-im-classical", True], + ], + # 9) -ch, -sh and -ss and the s-singular group take -es in the plural (churches, classes, lenses). + [ + ["([cs])h$", "\\1hes", None, False], + ["ss$", "sses", None, False], + ["x$", "xes", None, False], + ["s$", "ses", "s-singular", False], + ], + # 10) Certain words ending in -f or -fe take -ves in the plural (lives, wolves). + [ + ["([aeo]l)f$", "\\1ves", None, False], + ["([^d]ea)f$", "\\1ves", None, False], + ["arf$", "arves", None, False], + ["([nlw]i)fe$", "\\1ves", None, False], + ], + # 11) -y takes -ys if preceded by a vowel or when a proper noun, + # but -ies if preceded by a consonant (storeys, Marys, stories). 
+ [ + ["([aeiou])y$", "\\1ys", None, False], + ["([A-Z].*)y$", "\\1ys", None, False], + ["y$", "ies", None, False], + ], + # 12) Some words ending in -o take -os, the rest take -oes. + # Words in which the -o is preceded by a vowel always take -os (lassos, potatoes, bamboos). + [ + ["o$", "os", "o-os", False], + ["([aeiou])o$", "\\1os", None, False], + ["o$", "oes", None, False], + ], + # 13) Miltary stuff (Major Generals). + [["l$", "ls", "general-generals", False]], + # 14) Otherwise, assume that the plural just adds -s (cats, programmes). + [["$", "s", None, False]], +] + +# For performance, compile the regular expressions only once: +for ruleset in plural_rules: + for rule in ruleset: + rule[0] = re.compile(rule[0]) + +# Suffix categories. +plural_categories = { + "uninflected": [ + "aircraft", + "antelope", + "bison", + "bream", + "breeches", + "britches", + "carp", + "cattle", + "chassis", + "clippers", + "cod", + "contretemps", + "corps", + "debris", + "diabetes", + "djinn", + "eland", + "elk", + "flounder", + "gallows", + "graffiti", + "headquarters", + "herpes", + "high-jinks", + "homework", + "innings", + "jackanapes", + "mackerel", + "measles", + "mews", + "moose", + "mumps", + "offspring", + "news", + "pincers", + "pliers", + "proceedings", + "rabies", + "salmon", + "scissors", + "series", + "shears", + "species", + "swine", + "trout", + "tuna", + "whiting", + "wildebeest", + ], + "uncountable": [ + "advice", + "bread", + "butter", + "cannabis", + "cheese", + "electricity", + "equipment", + "fruit", + "furniture", + "garbage", + "gravel", + "happiness", + "information", + "ketchup", + "knowledge", + "love", + "luggage", + "mathematics", + "mayonnaise", + "meat", + "mustard", + "news", + "progress", + "research", + "rice", + "sand", + "software", + "understanding", + "water", + ], + "s-singular": [ + "acropolis", + "aegis", + "alias", + "asbestos", + "bathos", + "bias", + "bus", + "caddis", + "canvas", + "chaos", + "christmas", + "cosmos", + "dais", + "digitalis", + "epidermis", + "ethos", + "gas", + "glottis", + "ibis", + "lens", + "mantis", + "marquis", + "metropolis", + "pathos", + "pelvis", + "polis", + "rhinoceros", + "sassafras", + "trellis", + ], + "ex-ices": ["codex", "murex", "silex"], + "ex-ices-classical": [ + "apex", + "cortex", + "index", + "latex", + "pontifex", + "simplex", + "vertex", + "vortex", + ], + "um-a": [ + "agendum", + "bacterium", + "candelabrum", + "datum", + "desideratum", + "erratum", + "extremum", + "ovum", + "stratum", + ], + "um-a-classical": [ + "aquarium", + "compendium", + "consortium", + "cranium", + "curriculum", + "dictum", + "emporium", + "enconium", + "gymnasium", + "honorarium", + "interregnum", + "lustrum", + "maximum", + "medium", + "memorandum", + "millenium", + "minimum", + "momentum", + "optimum", + "phylum", + "quantum", + "rostrum", + "spectrum", + "speculum", + "stadium", + "trapezium", + "ultimatum", + "vacuum", + "velum", + ], + "on-a": [ + "aphelion", + "asyndeton", + "criterion", + "hyperbaton", + "noumenon", + "organon", + "perihelion", + "phenomenon", + "prolegomenon", + ], + "a-ae": ["alga", "alumna", "vertebra"], + "a-ae-classical": [ + "abscissa", + "amoeba", + "antenna", + "aurora", + "formula", + "hydra", + "hyperbola", + "lacuna", + "medusa", + "nebula", + "nova", + "parabola", + ], + "en-ina-classical": ["foramen", "lumen", "stamen"], + "a-ata-classical": [ + "anathema", + "bema", + "carcinoma", + "charisma", + "diploma", + "dogma", + "drama", + "edema", + "enema", + "enigma", + "gumma", + "lemma", + "lymphoma", + 
"magma", + "melisma", + "miasma", + "oedema", + "sarcoma", + "schema", + "soma", + "stigma", + "stoma", + "trauma", + ], + "is-ides-classical": ["clitoris", "iris"], + "us-i-classical": [ + "focus", + "fungus", + "genius", + "incubus", + "nimbus", + "nucleolus", + "radius", + "stylus", + "succubus", + "torus", + "umbilicus", + "uterus", + ], + "us-us-classical": [ + "apparatus", + "cantus", + "coitus", + "hiatus", + "impetus", + "nexus", + "plexus", + "prospectus", + "sinus", + "status", + ], + "o-i-classical": [ + "alto", + "basso", + "canto", + "contralto", + "crescendo", + "solo", + "soprano", + "tempo", + ], + "-i-classical": ["afreet", "afrit", "efreet"], + "-im-classical": ["cherub", "goy", "seraph"], + "o-os": [ + "albino", + "archipelago", + "armadillo", + "commando", + "ditto", + "dynamo", + "embryo", + "fiasco", + "generalissimo", + "ghetto", + "guano", + "inferno", + "jumbo", + "lingo", + "lumbago", + "magneto", + "manifesto", + "medico", + "octavo", + "photo", + "pro", + "quarto", + "rhino", + "stylo", + ], + "general-generals": [ + "Adjutant", + "Brigadier", + "Lieutenant", + "Major", + "Quartermaster", + "adjutant", + "brigadier", + "lieutenant", + "major", + "quartermaster", + ], +} + + +def pluralize(word, pos=NOUN, custom=None, classical=True): + """Returns the plural of a given word. + For example: child -> children. + Handles nouns and adjectives, using classical inflection by default + (e.g. where "matrix" pluralizes to "matrices" instead of "matrixes"). + The custom dictionary is for user-defined replacements. + """ + + if custom is None: + custom = {} + if word in custom: + return custom[word] + + # Recursion of genitives. + # Remove the apostrophe and any trailing -s, + # form the plural of the resultant noun, and then append an apostrophe (dog's -> dogs'). + if word.endswith("'") or word.endswith("'s"): + owner = word.rstrip("'s") + owners = pluralize(owner, pos, custom, classical) + if owners.endswith("s"): + return owners + "'" + else: + return owners + "'s" + + # Recursion of compound words + # (Postmasters General, mothers-in-law, Roman deities). + words = word.replace("-", " ").split(" ") + if len(words) > 1: + if ( + words[1] == "general" + or words[1] == "General" + and words[0] not in plural_categories["general-generals"] + ): + return word.replace(words[0], pluralize(words[0], pos, custom, classical)) + elif words[1] in plural_prepositions: + return word.replace(words[0], pluralize(words[0], pos, custom, classical)) + else: + return word.replace(words[-1], pluralize(words[-1], pos, custom, classical)) + + # Only a very few number of adjectives inflect. + n = list(range(len(plural_rules))) + if pos.startswith(ADJECTIVE): + n = [0, 1] + + # Apply pluralization rules. + for i in n: + ruleset = plural_rules[i] + for rule in ruleset: + suffix, inflection, category, classic = rule + # A general rule, or a classic rule in classical mode. + if category is None: + if not classic or (classic and classical): + if suffix.search(word) is not None: + return suffix.sub(inflection, word) + # A rule relating to a specific category of words. 
+ if category is not None: + if word in plural_categories[category] and ( + not classic or (classic and classical) + ): + if suffix.search(word) is not None: + return suffix.sub(inflection, word) + + +#### SINGULARIZE ################################################################################### +# Adapted from Bermi Ferrer's Inflector for Python: +# http://www.bermi.org/inflector/ + +# Copyright (c) 2006 Bermi Ferrer Martinez +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software to deal in this software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of this software, and to permit +# persons to whom this software is furnished to do so, subject to the following +# condition: +# +# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THIS SOFTWARE. + +singular_rules = [ + ["(?i)(.)ae$", "\\1a"], + ["(?i)(.)itis$", "\\1itis"], + ["(?i)(.)eaux$", "\\1eau"], + ["(?i)(quiz)zes$", "\\1"], + ["(?i)(matr)ices$", "\\1ix"], + ["(?i)(ap|vert|ind)ices$", "\\1ex"], + ["(?i)^(ox)en", "\\1"], + ["(?i)(alias|status)es$", "\\1"], + ["(?i)([octop|vir])i$", "\\1us"], + ["(?i)(cris|ax|test)es$", "\\1is"], + ["(?i)(shoe)s$", "\\1"], + ["(?i)(o)es$", "\\1"], + ["(?i)(bus)es$", "\\1"], + ["(?i)([m|l])ice$", "\\1ouse"], + ["(?i)(x|ch|ss|sh)es$", "\\1"], + ["(?i)(m)ovies$", "\\1ovie"], + ["(?i)(.)ombies$", "\\1ombie"], + ["(?i)(s)eries$", "\\1eries"], + ["(?i)([^aeiouy]|qu)ies$", "\\1y"], + # Certain words ending in -f or -fe take -ves in the plural (lives, wolves). + ["([aeo]l)ves$", "\\1f"], + ["([^d]ea)ves$", "\\1f"], + ["arves$", "arf"], + ["erves$", "erve"], + ["([nlw]i)ves$", "\\1fe"], + ["(?i)([lr])ves$", "\\1f"], + ["([aeo])ves$", "\\1ve"], + ["(?i)(sive)s$", "\\1"], + ["(?i)(tive)s$", "\\1"], + ["(?i)(hive)s$", "\\1"], + ["(?i)([^f])ves$", "\\1fe"], + # -es suffix. 
+ ["(?i)(^analy)ses$", "\\1sis"], + ["(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "\\1\\2sis"], + ["(?i)(.)opses$", "\\1opsis"], + ["(?i)(.)yses$", "\\1ysis"], + ["(?i)(h|d|r|o|n|b|cl|p)oses$", "\\1ose"], + ["(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose"], + ["(?i)(.)oses$", "\\1osis"], + # -a + ["(?i)([ti])a$", "\\1um"], + ["(?i)(n)ews$", "\\1ews"], + ["(?i)s$", ""], +] + +# For performance, compile the regular expressions only once: +for rule in singular_rules: + rule[0] = re.compile(rule[0]) + +singular_uninflected = [ + "aircraft", + "antelope", + "bison", + "bream", + "breeches", + "britches", + "carp", + "cattle", + "chassis", + "clippers", + "cod", + "contretemps", + "corps", + "debris", + "diabetes", + "djinn", + "eland", + "elk", + "flounder", + "gallows", + "georgia", + "graffiti", + "headquarters", + "herpes", + "high-jinks", + "homework", + "innings", + "jackanapes", + "mackerel", + "measles", + "mews", + "moose", + "mumps", + "news", + "offspring", + "pincers", + "pliers", + "proceedings", + "rabies", + "salmon", + "scissors", + "series", + "shears", + "species", + "swine", + "swiss", + "trout", + "tuna", + "whiting", + "wildebeest", +] +singular_uncountable = [ + "advice", + "bread", + "butter", + "cannabis", + "cheese", + "electricity", + "equipment", + "fruit", + "furniture", + "garbage", + "gravel", + "happiness", + "information", + "ketchup", + "knowledge", + "love", + "luggage", + "mathematics", + "mayonnaise", + "meat", + "mustard", + "news", + "progress", + "research", + "rice", + "sand", + "software", + "understanding", + "water", +] +singular_ie = [ + "algerie", + "auntie", + "beanie", + "birdie", + "bogie", + "bombie", + "bookie", + "collie", + "cookie", + "cutie", + "doggie", + "eyrie", + "freebie", + "goonie", + "groupie", + "hankie", + "hippie", + "hoagie", + "hottie", + "indie", + "junkie", + "laddie", + "laramie", + "lingerie", + "meanie", + "nightie", + "oldie", + "^pie", + "pixie", + "quickie", + "reverie", + "rookie", + "softie", + "sortie", + "stoolie", + "sweetie", + "techie", + "^tie", + "toughie", + "valkyrie", + "veggie", + "weenie", + "yuppie", + "zombie", +] +singular_s = plural_categories["s-singular"] + +# key plural, value singular +singular_irregular = { + "men": "man", + "people": "person", + "children": "child", + "sexes": "sex", + "axes": "axe", + "moves": "move", + "teeth": "tooth", + "geese": "goose", + "feet": "foot", + "zoa": "zoon", + "atlantes": "atlas", + "atlases": "atlas", + "beeves": "beef", + "brethren": "brother", + "corpora": "corpus", + "corpuses": "corpus", + "kine": "cow", + "ephemerides": "ephemeris", + "ganglia": "ganglion", + "genii": "genie", + "genera": "genus", + "graffiti": "graffito", + "helves": "helve", + "leaves": "leaf", + "loaves": "loaf", + "monies": "money", + "mongooses": "mongoose", + "mythoi": "mythos", + "octopodes": "octopus", + "opera": "opus", + "opuses": "opus", + "oxen": "ox", + "penes": "penis", + "penises": "penis", + "soliloquies": "soliloquy", + "testes": "testis", + "trilbys": "trilby", + "turves": "turf", + "numena": "numen", + "occipita": "occiput", + "our": "my", +} + + +def singularize(word, pos=NOUN, custom=None): + if custom is None: + custom = {} + if word in list(custom.keys()): + return custom[word] + + # Recursion of compound words (e.g. mothers-in-law). 
+ if "-" in word: + words = word.split("-") + if len(words) > 1 and words[1] in plural_prepositions: + return singularize(words[0], pos, custom) + "-" + "-".join(words[1:]) + # dogs' => dog's + if word.endswith("'"): + return singularize(word[:-1]) + "'s" + + lower = word.lower() + for w in singular_uninflected: + if w.endswith(lower): + return word + for w in singular_uncountable: + if w.endswith(lower): + return word + for w in singular_ie: + if lower.endswith(w + "s"): + return w + for w in singular_s: + if lower.endswith(w + "es"): + return w + for w in list(singular_irregular.keys()): + if lower.endswith(w): + return re.sub("(?i)" + w + "$", singular_irregular[w], word) + + for rule in singular_rules: + suffix, inflection = rule + match = suffix.search(word) + if match: + groups = match.groups() + for k in range(0, len(groups)): + if groups[k] is None: + inflection = inflection.replace("\\" + str(k + 1), "") + return suffix.sub(inflection, word) + + return word diff --git a/textblob/en/np_extractors.py b/src/textblob/en/np_extractors.py similarity index 57% rename from textblob/en/np_extractors.py rename to src/textblob/en/np_extractors.py index f6f174ae..489d6da9 100644 --- a/textblob/en/np_extractors.py +++ b/src/textblob/en/np_extractors.py @@ -1,139 +1,144 @@ -# -*- coding: utf-8 -*- -'''Various noun phrase extractors.''' -from __future__ import unicode_literals, absolute_import +"""Various noun phrase extractors.""" import nltk -from textblob.taggers import PatternTagger -from textblob.decorators import requires_nltk_corpus -from textblob.utils import tree2str, filter_insignificant from textblob.base import BaseNPExtractor +from textblob.decorators import requires_nltk_corpus +from textblob.taggers import PatternTagger +from textblob.utils import filter_insignificant, tree2str class ChunkParser(nltk.ChunkParserI): - def __init__(self): self._trained = False @requires_nltk_corpus def train(self): - '''Train the Chunker on the ConLL-2000 corpus.''' - train_data = [[(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)] - for sent in - nltk.corpus.conll2000.chunked_sents('train.txt', - chunk_types=['NP'])] + """Train the Chunker on the ConLL-2000 corpus.""" + train_data = [ + [(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)] + for sent in nltk.corpus.conll2000.chunked_sents( + "train.txt", chunk_types=["NP"] + ) + ] unigram_tagger = nltk.UnigramTagger(train_data) self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger) self._trained = True def parse(self, sentence): - '''Return the parse tree for the sentence.''' + """Return the parse tree for the sentence.""" if not self._trained: self.train() pos_tags = [pos for (word, pos) in sentence] tagged_pos_tags = self.tagger.tag(pos_tags) chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags] - conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in - zip(sentence, chunktags)] + conlltags = [ + (word, pos, chunktag) + for ((word, pos), chunktag) in zip(sentence, chunktags) + ] return nltk.chunk.util.conlltags2tree(conlltags) class ConllExtractor(BaseNPExtractor): - '''A noun phrase extractor that uses chunk parsing trained with the + """A noun phrase extractor that uses chunk parsing trained with the ConLL-2000 training corpus. 
- ''' + """ POS_TAGGER = PatternTagger() # The context-free grammar with which to filter the noun phrases CFG = { - ('NNP', 'NNP'): 'NNP', - ('NN', 'NN'): 'NNI', - ('NNI', 'NN'): 'NNI', - ('JJ', 'JJ'): 'JJ', - ('JJ', 'NN'): 'NNI', - } + ("NNP", "NNP"): "NNP", + ("NN", "NN"): "NNI", + ("NNI", "NN"): "NNI", + ("JJ", "JJ"): "JJ", + ("JJ", "NN"): "NNI", + } # POS suffixes that will be ignored - INSIGNIFICANT_SUFFIXES = ['DT', 'CC', 'PRP$', 'PRP'] + INSIGNIFICANT_SUFFIXES = ["DT", "CC", "PRP$", "PRP"] def __init__(self, parser=None): self.parser = ChunkParser() if not parser else parser def extract(self, text): - '''Return a list of noun phrases (strings) for body of text.''' + """Return a list of noun phrases (strings) for body of text.""" sentences = nltk.tokenize.sent_tokenize(text) noun_phrases = [] for sentence in sentences: parsed = self._parse_sentence(sentence) # Get the string representation of each subtree that is a # noun phrase tree - phrases = [_normalize_tags(filter_insignificant(each, - self.INSIGNIFICANT_SUFFIXES)) for each in parsed - if isinstance(each, nltk.tree.Tree) and each.label() - == 'NP' and len(filter_insignificant(each)) >= 1 - and _is_match(each, cfg=self.CFG)] + phrases = [ + _normalize_tags(filter_insignificant(each, self.INSIGNIFICANT_SUFFIXES)) + for each in parsed + if isinstance(each, nltk.tree.Tree) + and each.label() == "NP" + and len(filter_insignificant(each)) >= 1 + and _is_match(each, cfg=self.CFG) + ] nps = [tree2str(phrase) for phrase in phrases] noun_phrases.extend(nps) return noun_phrases def _parse_sentence(self, sentence): - '''Tag and parse a sentence (a plain, untagged string).''' + """Tag and parse a sentence (a plain, untagged string).""" tagged = self.POS_TAGGER.tag(sentence) return self.parser.parse(tagged) class FastNPExtractor(BaseNPExtractor): - '''A fast and simple noun phrase extractor. + """A fast and simple noun phrase extractor. Credit to Shlomi Babluk. 
Link to original blog post: http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/ - ''' + """ CFG = { - ('NNP', 'NNP'): 'NNP', - ('NN', 'NN'): 'NNI', - ('NNI', 'NN'): 'NNI', - ('JJ', 'JJ'): 'JJ', - ('JJ', 'NN'): 'NNI', - } + ("NNP", "NNP"): "NNP", + ("NN", "NN"): "NNI", + ("NNI", "NN"): "NNI", + ("JJ", "JJ"): "JJ", + ("JJ", "NN"): "NNI", + } def __init__(self): self._trained = False @requires_nltk_corpus def train(self): - train_data = nltk.corpus.brown.tagged_sents(categories='news') - regexp_tagger = nltk.RegexpTagger([ - (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), - (r'(-|:|;)$', ':'), - (r'\'*$', 'MD'), - (r'(The|the|A|a|An|an)$', 'AT'), - (r'.*able$', 'JJ'), - (r'^[A-Z].*$', 'NNP'), - (r'.*ness$', 'NN'), - (r'.*ly$', 'RB'), - (r'.*s$', 'NNS'), - (r'.*ing$', 'VBG'), - (r'.*ed$', 'VBD'), - (r'.*', 'NN'), - ]) + train_data = nltk.corpus.brown.tagged_sents(categories="news") + regexp_tagger = nltk.RegexpTagger( + [ + (r"^-?[0-9]+(.[0-9]+)?$", "CD"), + (r"(-|:|;)$", ":"), + (r"\'*$", "MD"), + (r"(The|the|A|a|An|an)$", "AT"), + (r".*able$", "JJ"), + (r"^[A-Z].*$", "NNP"), + (r".*ness$", "NN"), + (r".*ly$", "RB"), + (r".*s$", "NNS"), + (r".*ing$", "VBG"), + (r".*ed$", "VBD"), + (r".*", "NN"), + ] + ) unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger) self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger) self._trained = True return None - def _tokenize_sentence(self, sentence): - '''Split the sentence into single words/tokens''' + """Split the sentence into single words/tokens""" tokens = nltk.word_tokenize(sentence) return tokens def extract(self, sentence): - '''Return a list of noun phrases (strings) for body of text.''' + """Return a list of noun phrases (strings) for body of text.""" if not self._trained: self.train() tokens = self._tokenize_sentence(sentence) @@ -146,35 +151,36 @@ def extract(self, sentence): t1 = tags[x] t2 = tags[x + 1] key = t1[1], t2[1] - value = self.CFG.get(key, '') + value = self.CFG.get(key, "") if value: merge = True tags.pop(x) tags.pop(x) - match = '%s %s' % (t1[0], t2[0]) + match = f"{t1[0]} {t2[0]}" pos = value tags.insert(x, (match, pos)) break - matches = [t[0] for t in tags if t[1] in ['NNP', 'NNI']] + matches = [t[0] for t in tags if t[1] in ["NNP", "NNI"]] return matches ### Utility methods ### + def _normalize_tags(chunk): - '''Normalize the corpus tags. + """Normalize the corpus tags. ("NN", "NN-PL", "NNS") -> "NN" - ''' + """ ret = [] for word, tag in chunk: - if tag == 'NP-TL' or tag == 'NP': - ret.append((word, 'NNP')) + if tag == "NP-TL" or tag == "NP": + ret.append((word, "NNP")) continue - if tag.endswith('-TL'): + if tag.endswith("-TL"): ret.append((word, tag[:-3])) continue - if tag.endswith('S'): + if tag.endswith("S"): ret.append((word, tag[:-1])) continue ret.append((word, tag)) @@ -182,8 +188,7 @@ def _normalize_tags(chunk): def _is_match(tagged_phrase, cfg): - '''Return whether or not a tagged phrases matches a context-free grammar. 
-    '''
+    """Return whether or not a tagged phrase matches a context-free grammar."""
     copy = list(tagged_phrase)  # A copy of the list
     merge = True
     while merge:
@@ -196,9 +201,9 @@ def _is_match(tagged_phrase, cfg):
             merge = True
             copy.pop(i)
             copy.pop(i)
-            match = '{0} {1}'.format(first[0], second[0])
+            match = f"{first[0]} {second[0]}"
             pos = value
             copy.insert(i, (match, pos))
             break
-    match = any([t[1] in ('NNP', 'NNI') for t in copy])
+    match = any([t[1] in ("NNP", "NNI") for t in copy])
     return match
diff --git a/textblob/en/parsers.py b/src/textblob/en/parsers.py
similarity index 86%
rename from textblob/en/parsers.py
rename to src/textblob/en/parsers.py
index d1678d2a..63a0b29c 100644
--- a/textblob/en/parsers.py
+++ b/src/textblob/en/parsers.py
@@ -1,11 +1,9 @@
-# -*- coding: utf-8 -*-
 """Various parser implementations.

 .. versionadded:: 0.6.0
 """
-from __future__ import absolute_import
-from textblob.en import parse as pattern_parse
 from textblob.base import BaseParser
+from textblob.en import parse as pattern_parse


 class PatternParser(BaseParser):
diff --git a/textblob/en/sentiments.py b/src/textblob/en/sentiments.py
similarity index 70%
rename from textblob/en/sentiments.py
rename to src/textblob/en/sentiments.py
index e5106bf9..40da6c0e 100644
--- a/textblob/en/sentiments.py
+++ b/src/textblob/en/sentiments.py
@@ -1,17 +1,15 @@
-# -*- coding: utf-8 -*-
 """Sentiment analysis implementations.

 .. versionadded:: 0.5.0
 """
-from __future__ import absolute_import
 from collections import namedtuple

 import nltk

+from textblob.base import CONTINUOUS, DISCRETE, BaseSentimentAnalyzer
+from textblob.decorators import requires_nltk_corpus
 from textblob.en import sentiment as pattern_sentiment
 from textblob.tokenizers import word_tokenize
-from textblob.decorators import requires_nltk_corpus
-from textblob.base import BaseSentimentAnalyzer, DISCRETE, CONTINUOUS


 class PatternAnalyzer(BaseSentimentAnalyzer):
@@ -23,10 +21,11 @@ class PatternAnalyzer(BaseSentimentAnalyzer):
     where [assessments] is a list of the assessed tokens and their
     polarity and subjectivity scores
     """
+
     kind = CONTINUOUS
     # This is only here for backwards-compatibility.
# The return type is actually determined upon calling analyze() - RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity']) + RETURN_TYPE = namedtuple("Sentiment", ["polarity", "subjectivity"]) def analyze(self, text, keep_assessments=False): """Return the sentiment as a named tuple of the form: @@ -34,19 +33,21 @@ def analyze(self, text, keep_assessments=False): """ #: Return type declaration if keep_assessments: - Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity', 'assessments']) + Sentiment = namedtuple( + "Sentiment", ["polarity", "subjectivity", "assessments"] + ) assessments = pattern_sentiment(text).assessments polarity, subjectivity = pattern_sentiment(text) return Sentiment(polarity, subjectivity, assessments) else: - Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity']) + Sentiment = namedtuple("Sentiment", ["polarity", "subjectivity"]) return Sentiment(*pattern_sentiment(text)) def _default_feature_extractor(words): """Default feature extractor for the NaiveBayesAnalyzer.""" - return dict(((word, True) for word in words)) + return dict((word, True) for word in words) class NaiveBayesAnalyzer(BaseSentimentAnalyzer): @@ -60,23 +61,33 @@ class NaiveBayesAnalyzer(BaseSentimentAnalyzer): kind = DISCRETE #: Return type declaration - RETURN_TYPE = namedtuple('Sentiment', ['classification', 'p_pos', 'p_neg']) + RETURN_TYPE = namedtuple("Sentiment", ["classification", "p_pos", "p_neg"]) def __init__(self, feature_extractor=_default_feature_extractor): - super(NaiveBayesAnalyzer, self).__init__() + super().__init__() self._classifier = None self.feature_extractor = feature_extractor @requires_nltk_corpus def train(self): """Train the Naive Bayes classifier on the movie review corpus.""" - super(NaiveBayesAnalyzer, self).train() - neg_ids = nltk.corpus.movie_reviews.fileids('neg') - pos_ids = nltk.corpus.movie_reviews.fileids('pos') - neg_feats = [(self.feature_extractor( - nltk.corpus.movie_reviews.words(fileids=[f])), 'neg') for f in neg_ids] - pos_feats = [(self.feature_extractor( - nltk.corpus.movie_reviews.words(fileids=[f])), 'pos') for f in pos_ids] + super().train() + neg_ids = nltk.corpus.movie_reviews.fileids("neg") + pos_ids = nltk.corpus.movie_reviews.fileids("pos") + neg_feats = [ + ( + self.feature_extractor(nltk.corpus.movie_reviews.words(fileids=[f])), + "neg", + ) + for f in neg_ids + ] + pos_feats = [ + ( + self.feature_extractor(nltk.corpus.movie_reviews.words(fileids=[f])), + "pos", + ) + for f in pos_ids + ] train_data = neg_feats + pos_feats self._classifier = nltk.classify.NaiveBayesClassifier.train(train_data) @@ -85,13 +96,13 @@ def analyze(self, text): ``Sentiment(classification, p_pos, p_neg)`` """ # Lazily train the classifier - super(NaiveBayesAnalyzer, self).analyze(text) + super().analyze(text) tokens = word_tokenize(text, include_punc=False) filtered = (t.lower() for t in tokens if len(t) >= 3) feats = self.feature_extractor(filtered) prob_dist = self._classifier.prob_classify(feats) return self.RETURN_TYPE( classification=prob_dist.max(), - p_pos=prob_dist.prob('pos'), - p_neg=prob_dist.prob("neg") + p_pos=prob_dist.prob("pos"), + p_neg=prob_dist.prob("neg"), ) diff --git a/textblob/en/taggers.py b/src/textblob/en/taggers.py similarity index 80% rename from textblob/en/taggers.py rename to src/textblob/en/taggers.py index 65e30629..c8b0c169 100644 --- a/textblob/en/taggers.py +++ b/src/textblob/en/taggers.py @@ -1,14 +1,11 @@ -# -*- coding: utf-8 -*- """Parts-of-speech tagger implementations.""" -from __future__ import 
absolute_import import nltk -import textblob.compat import textblob as tb -from textblob.en import tag as pattern_tag -from textblob.decorators import requires_nltk_corpus from textblob.base import BaseTagger +from textblob.decorators import requires_nltk_corpus +from textblob.en import tag as pattern_tag class PatternTagger(BaseTagger): @@ -19,7 +16,7 @@ class PatternTagger(BaseTagger): def tag(self, text, tokenize=True): """Tag a string or BaseBlob.""" - if not isinstance(text, textblob.compat.text_type): + if not isinstance(text, str): text = text.raw return pattern_tag(text, tokenize) @@ -32,7 +29,7 @@ class NLTKTagger(BaseTagger): @requires_nltk_corpus def tag(self, text): """Tag a string or BaseBlob.""" - if isinstance(text, textblob.compat.text_type): + if isinstance(text, str): text = tb.TextBlob(text) return nltk.tag.pos_tag(text.tokens) diff --git a/textblob/exceptions.py b/src/textblob/exceptions.py similarity index 93% rename from textblob/exceptions.py rename to src/textblob/exceptions.py index 004c41e1..26105376 100644 --- a/textblob/exceptions.py +++ b/src/textblob/exceptions.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - MISSING_CORPUS_MESSAGE = """ Looks like you are missing some required data for this feature. @@ -11,38 +9,49 @@ If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues. """ + class TextBlobError(Exception): """A TextBlob-related error.""" + pass TextBlobException = TextBlobError # Backwards compat + class MissingCorpusError(TextBlobError): """Exception thrown when a user tries to use a feature that requires a dataset or model that the user does not have on their system. """ def __init__(self, message=MISSING_CORPUS_MESSAGE, *args, **kwargs): - super(MissingCorpusError, self).__init__(message, *args, **kwargs) + super().__init__(message, *args, **kwargs) MissingCorpusException = MissingCorpusError # Backwards compat + class DeprecationError(TextBlobError): """Raised when user uses a deprecated feature.""" + pass + class TranslatorError(TextBlobError): """Raised when an error occurs during language translation or detection.""" + pass + class NotTranslated(TranslatorError): """Raised when text is unchanged after translation. This may be due to the language being unsupported by the translator. """ + pass + class FormatError(TextBlobError): """Raised if a data file with an unsupported format is passed to a classifier.""" + pass diff --git a/textblob/formats.py b/src/textblob/formats.py similarity index 88% rename from textblob/formats.py rename to src/textblob/formats.py index 7aa5083f..312bc997 100644 --- a/textblob/formats.py +++ b/src/textblob/formats.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """File formats for training and testing data. Includes a registry of valid file formats. New file formats can be added to the @@ -19,16 +18,16 @@ class PipeDelimitedFormat(formats.DelimitedFormat): with open('training_data.psv', 'r') as fp: cl = NaiveBayesAnalyzer(fp, format='psv') """ -from __future__ import absolute_import +import csv import json from collections import OrderedDict -from textblob.compat import PY2, csv from textblob.utils import is_filelike -DEFAULT_ENCODING = 'utf-8' +DEFAULT_ENCODING = "utf-8" -class BaseFormat(object): + +class BaseFormat: """Interface for format classes. Individual formats can decide on the composition and meaning of ``**kwargs``. @@ -37,6 +36,7 @@ class BaseFormat(object): .. versionchanged:: 0.9.0 Constructor receives a file pointer rather than a file path. 
""" + def __init__(self, fp, **kwargs): pass @@ -54,6 +54,7 @@ def detect(cls, stream): """ raise NotImplementedError('Must implement a "detect" class method.') + class DelimitedFormat(BaseFormat): """A general character-delimited format.""" @@ -61,11 +62,7 @@ class DelimitedFormat(BaseFormat): def __init__(self, fp, **kwargs): BaseFormat.__init__(self, fp, **kwargs) - if PY2: - reader = csv.reader(fp, delimiter=self.delimiter, - encoding=DEFAULT_ENCODING) - else: - reader = csv.reader(fp, delimiter=self.delimiter) + reader = csv.reader(fp, delimiter=self.delimiter) self.data = [row for row in reader] def to_iterable(self): @@ -89,12 +86,13 @@ class CSV(DelimitedFormat): Today is a good day,pos I hate this car.,pos """ + delimiter = "," class TSV(DelimitedFormat): - """TSV format. Assumes each row is of the form ``text\tlabel``. - """ + """TSV format. Assumes each row is of the form ``text\tlabel``.""" + delimiter = "\t" @@ -110,13 +108,14 @@ class JSON(BaseFormat): {"text": "I hate this car.", "label": "neg"} ] """ + def __init__(self, fp, **kwargs): BaseFormat.__init__(self, fp, **kwargs) self.dict = json.load(fp) def to_iterable(self): """Return an iterable object from the JSON data.""" - return [(d['text'], d['label']) for d in self.dict] + return [(d["text"], d["label"]) for d in self.dict] @classmethod def detect(cls, stream): @@ -128,11 +127,14 @@ def detect(cls, stream): return False -_registry = OrderedDict([ - ('csv', CSV), - ('json', JSON), - ('tsv', TSV), -]) +_registry = OrderedDict( + [ + ("csv", CSV), + ("json", JSON), + ("tsv", TSV), + ] +) + def detect(fp, max_read=1024): """Attempt to detect a file's format, trying each of the supported @@ -148,10 +150,12 @@ def detect(fp, max_read=1024): fp.seek(0) return None + def get_registry(): """Return a dictionary of registered formats.""" return _registry + def register(name, format_class): """Register a new format. diff --git a/src/textblob/inflect.py b/src/textblob/inflect.py new file mode 100644 index 00000000..65ac3334 --- /dev/null +++ b/src/textblob/inflect.py @@ -0,0 +1,15 @@ +"""Make word inflection default to English. This allows for backwards +compatibility so you can still import text.inflect. 
+ + >>> from textblob.inflect import singularize + +is equivalent to + + >>> from textblob.en.inflect import singularize +""" +from textblob.en.inflect import pluralize, singularize + +__all__ = [ + "singularize", + "pluralize", +] diff --git a/textblob/mixins.py b/src/textblob/mixins.py similarity index 75% rename from textblob/mixins.py rename to src/textblob/mixins.py index 377fe1fa..b3a134a5 100644 --- a/textblob/mixins.py +++ b/src/textblob/mixins.py @@ -1,12 +1,9 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import sys -from textblob.compat import basestring, implements_to_string, PY2, binary_type -class ComparableMixin(object): +class ComparableMixin: - '''Implements rich operators for an object.''' + """Implements rich operators for an object.""" def _compare(self, other, method): try: @@ -37,57 +34,54 @@ def __ne__(self, other): class BlobComparableMixin(ComparableMixin): - '''Allow blob objects to be comparable with both strings and blobs.''' + """Allow blob objects to be comparable with both strings and blobs.""" def _compare(self, other, method): - if isinstance(other, basestring): + if isinstance(other, (str, bytes)): # Just compare with the other string return method(self._cmpkey(), other) - return super(BlobComparableMixin, self)._compare(other, method) + return super()._compare(other, method) -@implements_to_string -class StringlikeMixin(object): +class StringlikeMixin: - '''Make blob objects behave like Python strings. + """Make blob objects behave like Python strings. Expects that classes that use this mixin to have a _strkey() method that returns the string to apply string methods to. Using _strkey() instead of __str__ ensures consistent behavior between Python 2 and 3. - ''' + """ def __repr__(self): - '''Returns a string representation for debugging.''' + """Returns a string representation for debugging.""" class_name = self.__class__.__name__ - text = self.__unicode__().encode("utf-8") if PY2 else str(self) - ret = '{cls}("{text}")'.format(cls=class_name, - text=text) - return binary_type(ret) if PY2 else ret + text = str(self) + return f'{class_name}("{text}")' def __str__(self): - '''Returns a string representation used in print statements - or str(my_blob).''' + """Returns a string representation used in print statements + or str(my_blob).""" return self._strkey() def __len__(self): - '''Returns the length of the raw text.''' + """Returns the length of the raw text.""" return len(self._strkey()) def __iter__(self): - '''Makes the object iterable as if it were a string, + """Makes the object iterable as if it were a string, iterating through the raw string's characters. - ''' + """ return iter(self._strkey()) def __contains__(self, sub): - '''Implements the `in` keyword like a Python string.''' + """Implements the `in` keyword like a Python string.""" return sub in self._strkey() def __getitem__(self, index): - '''Returns a substring. If index is an integer, returns a Python + """Returns a substring. If index is an integer, returns a Python string of a single character. If a range is given, e.g. `blob[3:5]`, a new instance of the class is returned. - ''' + """ if isinstance(index, int): return self._strkey()[index] # Just return a single character else: @@ -95,29 +89,29 @@ def __getitem__(self, index): return self.__class__(self._strkey()[index]) def find(self, sub, start=0, end=sys.maxsize): - '''Behaves like the built-in str.find() method. Returns an integer, + """Behaves like the built-in str.find() method. 
Returns an integer, the index of the first occurrence of the substring argument sub in the sub-string given by [start:end]. - ''' + """ return self._strkey().find(sub, start, end) def rfind(self, sub, start=0, end=sys.maxsize): - '''Behaves like the built-in str.rfind() method. Returns an integer, + """Behaves like the built-in str.rfind() method. Returns an integer, the index of he last (right-most) occurence of the substring argument sub in the sub-sequence given by [start:end]. - ''' + """ return self._strkey().rfind(sub, start, end) def index(self, sub, start=0, end=sys.maxsize): - '''Like blob.find() but raise ValueError when the substring + """Like blob.find() but raise ValueError when the substring is not found. - ''' + """ return self._strkey().index(sub, start, end) def rindex(self, sub, start=0, end=sys.maxsize): - '''Like blob.rfind() but raise ValueError when substring is not + """Like blob.rfind() but raise ValueError when substring is not found. - ''' + """ return self._strkey().rindex(sub, start, end) def startswith(self, prefix, start=0, end=sys.maxsize): @@ -143,8 +137,7 @@ def format(self, *args, **kwargs): return self.__class__(self._strkey().format(*args, **kwargs)) def split(self, sep=None, maxsplit=sys.maxsize): - """Behaves like the built-in str.split(). - """ + """Behaves like the built-in str.split().""" return self._strkey().split(sep, maxsplit) def strip(self, chars=None): @@ -154,13 +147,11 @@ def strip(self, chars=None): return self.__class__(self._strkey().strip(chars)) def upper(self): - """Like str.upper(), returns new object with all upper-cased characters. - """ + """Like str.upper(), returns new object with all upper-cased characters.""" return self.__class__(self._strkey().upper()) def lower(self): - """Like str.lower(), returns new object with all lower-cased characters. 
-        """
+        """Like str.lower(), returns new object with all lower-cased characters."""
         return self.__class__(self._strkey().lower())

     def join(self, iterable):
diff --git a/textblob/np_extractors.py b/src/textblob/np_extractors.py
similarity index 74%
rename from textblob/np_extractors.py
rename to src/textblob/np_extractors.py
index ea80c959..13bbd7e3 100644
--- a/textblob/np_extractors.py
+++ b/src/textblob/np_extractors.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """Default noun phrase extractors are for English to maintain backwards
 compatibility, so you can still do

@@ -8,12 +7,11 @@
 >>> from textblob.en.np_extractors import ConllExtractor
 """
-from __future__ import absolute_import
 from textblob.base import BaseNPExtractor
 from textblob.en.np_extractors import ConllExtractor, FastNPExtractor

 __all__ = [
-    'BaseNPExtractor',
-    'ConllExtractor',
-    'FastNPExtractor',
+    "BaseNPExtractor",
+    "ConllExtractor",
+    "FastNPExtractor",
 ]
diff --git a/textblob/parsers.py b/src/textblob/parsers.py
similarity index 58%
rename from textblob/parsers.py
rename to src/textblob/parsers.py
index 49884219..83f6d506 100644
--- a/textblob/parsers.py
+++ b/src/textblob/parsers.py
@@ -1,17 +1,15 @@
-# -*- coding: utf-8 -*-
-'''Default parsers to English for backwards compatibility so you can still do
+"""Default parsers to English for backwards compatibility so you can still do

 >>> from textblob.parsers import PatternParser

 which is equivalent to

 >>> from textblob.en.parsers import PatternParser
-'''
-from __future__ import absolute_import
+"""
 from textblob.base import BaseParser
 from textblob.en.parsers import PatternParser

 __all__ = [
-    'BaseParser',
-    'PatternParser',
+    "BaseParser",
+    "PatternParser",
 ]
diff --git a/src/textblob/sentiments.py b/src/textblob/sentiments.py
new file mode 100644
index 00000000..0c855679
--- /dev/null
+++ b/src/textblob/sentiments.py
@@ -0,0 +1,24 @@
+"""Default sentiment analyzers are English for backwards compatibility, so
+you can still do
+
+>>> from textblob.sentiments import PatternAnalyzer
+
+which is equivalent to
+
+>>> from textblob.en.sentiments import PatternAnalyzer
+"""
+from textblob.base import BaseSentimentAnalyzer
+from textblob.en.sentiments import (
+    CONTINUOUS,
+    DISCRETE,
+    NaiveBayesAnalyzer,
+    PatternAnalyzer,
+)
+
+__all__ = [
+    "BaseSentimentAnalyzer",
+    "DISCRETE",
+    "CONTINUOUS",
+    "PatternAnalyzer",
+    "NaiveBayesAnalyzer",
+]
diff --git a/src/textblob/taggers.py b/src/textblob/taggers.py
new file mode 100644
index 00000000..6a861ceb
--- /dev/null
+++ b/src/textblob/taggers.py
@@ -0,0 +1,17 @@
+"""Default taggers to the English taggers for backwards compatibility, so you
+can still do
+
+>>> from textblob.taggers import NLTKTagger
+
+which is equivalent to
+
+>>> from textblob.en.taggers import NLTKTagger
+"""
+from textblob.base import BaseTagger
+from textblob.en.taggers import NLTKTagger, PatternTagger
+
+__all__ = [
+    "BaseTagger",
+    "PatternTagger",
+    "NLTKTagger",
+]
diff --git a/textblob/tokenizers.py b/src/textblob/tokenizers.py
similarity index 75%
rename from textblob/tokenizers.py
rename to src/textblob/tokenizers.py
index ce2f7f46..d5adea10 100644
--- a/textblob/tokenizers.py
+++ b/src/textblob/tokenizers.py
@@ -1,16 +1,14 @@
-# -*- coding: utf-8 -*-
-'''Various tokenizer implementations.
+"""Various tokenizer implementations.

..
versionadded:: 0.4.0 -''' -from __future__ import absolute_import +""" from itertools import chain import nltk -from textblob.utils import strip_punc from textblob.base import BaseTokenizer from textblob.decorators import requires_nltk_corpus +from textblob.utils import strip_punc class WordTokenizer(BaseTokenizer): @@ -26,11 +24,12 @@ class WordTokenizer(BaseTokenizer): """ def tokenize(self, text, include_punc=True): - '''Return a list of word tokens. + """Return a list of word tokens. :param text: string of text. - :param include_punc: (optional) whether to include punctuation as separate tokens. Default to True. - ''' + :param include_punc: (optional) whether to + include punctuation as separate tokens. Default to True. + """ tokens = nltk.tokenize.word_tokenize(text) if include_punc: return tokens @@ -40,8 +39,11 @@ def tokenize(self, text, include_punc=True): # e.g. "Let's" => ["Let", "'s"] # e.g. "Can't" => ["Ca", "n't"] # e.g. "home." => ['home'] - return [word if word.startswith("'") else strip_punc(word, all=False) - for word in tokens if strip_punc(word, all=False)] + return [ + word if word.startswith("'") else strip_punc(word, all=False) + for word in tokens + if strip_punc(word, all=False) + ] class SentenceTokenizer(BaseTokenizer): @@ -53,7 +55,7 @@ class SentenceTokenizer(BaseTokenizer): @requires_nltk_corpus def tokenize(self, text): - '''Return a list of sentences.''' + """Return a list of sentences.""" return nltk.tokenize.sent_tokenize(text) @@ -61,6 +63,8 @@ def tokenize(self, text): sent_tokenize = SentenceTokenizer().itokenize _word_tokenizer = WordTokenizer() # Singleton word tokenizer + + def word_tokenize(text, include_punc=True, *args, **kwargs): """Convenience function for tokenizing text into words. @@ -68,7 +72,7 @@ def word_tokenize(text, include_punc=True, *args, **kwargs): tokenized to sentences before being tokenized to words. 
""" words = chain.from_iterable( - _word_tokenizer.itokenize(sentence, include_punc=include_punc, - *args, **kwargs) - for sentence in sent_tokenize(text)) + _word_tokenizer.itokenize(sentence, include_punc, *args, **kwargs) + for sentence in sent_tokenize(text) + ) return words diff --git a/textblob/unicodecsv/__init__.py b/src/textblob/unicodecsv/__init__.py similarity index 61% rename from textblob/unicodecsv/__init__.py rename to src/textblob/unicodecsv/__init__.py index 752f403f..b32470f3 100644 --- a/textblob/unicodecsv/__init__.py +++ b/src/textblob/unicodecsv/__init__.py @@ -1,54 +1,56 @@ -# -*- coding: utf-8 -*- import csv -from textblob.compat import izip -#http://semver.org/ + +# http://semver.org/ VERSION = (0, 9, 4) -__version__ = ".".join(map(str,VERSION)) +__version__ = ".".join(map(str, VERSION)) pass_throughs = [ - 'register_dialect', - 'unregister_dialect', - 'get_dialect', - 'list_dialects', - 'field_size_limit', - 'Dialect', - 'excel', - 'excel_tab', - 'Sniffer', - 'QUOTE_ALL', - 'QUOTE_MINIMAL', - 'QUOTE_NONNUMERIC', - 'QUOTE_NONE', - 'Error' + "register_dialect", + "unregister_dialect", + "get_dialect", + "list_dialects", + "field_size_limit", + "Dialect", + "excel", + "excel_tab", + "Sniffer", + "QUOTE_ALL", + "QUOTE_MINIMAL", + "QUOTE_NONNUMERIC", + "QUOTE_NONE", + "Error", ] __all__ = [ - 'reader', - 'writer', - 'DictReader', - 'DictWriter', + "reader", + "writer", + "DictReader", + "DictWriter", ] + pass_throughs for prop in pass_throughs: - globals()[prop]=getattr(csv, prop) + globals()[prop] = getattr(csv, prop) + def _stringify(s, encoding, errors): if s is None: - return '' + return "" if isinstance(s, unicode): return s.encode(encoding, errors) - elif isinstance(s, (int , float)): - pass #let csv.QUOTE_NONNUMERIC do its thing. + elif isinstance(s, (int, float)): + pass # let csv.QUOTE_NONNUMERIC do its thing. 
elif not isinstance(s, str): - s=str(s) + s = str(s) return s -def _stringify_list(l, encoding, errors='strict'): + +def _stringify_list(l, encoding, errors="strict"): try: return [_stringify(s, encoding, errors) for s in iter(l)] except TypeError as e: raise csv.Error(str(e)) + def _unicodify(s, encoding): if s is None: return None @@ -58,7 +60,8 @@ def _unicodify(s, encoding): return s.decode(encoding) return s -class UnicodeWriter(object): + +class UnicodeWriter: """ >>> import unicodecsv >>> from cStringIO import StringIO @@ -73,8 +76,10 @@ class UnicodeWriter(object): >>> row[1] == u'ñ' True """ - def __init__(self, f, dialect=csv.excel, encoding='utf-8', errors='strict', - *args, **kwds): + + def __init__( + self, f, dialect=csv.excel, encoding="utf-8", errors="strict", *args, **kwds + ): self.encoding = encoding self.writer = csv.writer(f, dialect, *args, **kwds) self.encoding_errors = errors @@ -84,17 +89,27 @@ def writerow(self, row): def writerows(self, rows): for row in rows: - self.writerow(row) + self.writerow(row) @property def dialect(self): return self.writer.dialect + + writer = UnicodeWriter -class UnicodeReader(object): - def __init__(self, f, dialect=None, encoding='utf-8', errors='strict', - **kwds): - format_params = ['delimiter', 'doublequote', 'escapechar', 'lineterminator', 'quotechar', 'quoting', 'skipinitialspace'] + +class UnicodeReader: + def __init__(self, f, dialect=None, encoding="utf-8", errors="strict", **kwds): + format_params = [ + "delimiter", + "doublequote", + "escapechar", + "lineterminator", + "quotechar", + "quoting", + "skipinitialspace", + ] if dialect is None: if not any([kwd_name in format_params for kwd_name in kwds.keys()]): dialect = csv.excel @@ -108,8 +123,14 @@ def next(self): encoding_errors = self.encoding_errors float_ = float unicode_ = unicode - return [(value if isinstance(value, float_) else - unicode_(value, encoding, encoding_errors)) for value in row] + return [ + ( + value + if isinstance(value, float_) + else unicode_(value, encoding, encoding_errors) + ) + for value in row + ] def __iter__(self): return self @@ -121,8 +142,11 @@ def dialect(self): @property def line_num(self): return self.reader.line_num + + reader = UnicodeReader + class DictWriter(csv.DictWriter): """ >>> from cStringIO import StringIO @@ -140,17 +164,34 @@ class DictWriter(csv.DictWriter): >>> r.next() == {'a': u'\xc3\xa9', u'ñ':'2', 'r': [u'\xc3\xae']} True """ - def __init__(self, csvfile, fieldnames, restval='', extrasaction='raise', dialect='excel', encoding='utf-8', errors='strict', *args, **kwds): + + def __init__( + self, + csvfile, + fieldnames, + restval="", + extrasaction="raise", + dialect="excel", + encoding="utf-8", + errors="strict", + *args, + **kwds, + ): self.encoding = encoding - csv.DictWriter.__init__(self, csvfile, fieldnames, restval, extrasaction, dialect, *args, **kwds) - self.writer = UnicodeWriter(csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds) + csv.DictWriter.__init__( + self, csvfile, fieldnames, restval, extrasaction, dialect, *args, **kwds + ) + self.writer = UnicodeWriter( + csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds + ) self.encoding_errors = errors def writeheader(self): - fieldnames = _stringify_list(self.fieldnames, self.encoding, self.encoding_errors) + _stringify_list(self.fieldnames, self.encoding, self.encoding_errors) header = dict(zip(self.fieldnames, self.fieldnames)) self.writerow(header) + class DictReader(csv.DictReader): """ >>> from cStringIO import StringIO @@ 
-168,26 +209,40 @@ class DictReader(csv.DictReader): >>> print r.next() == {'name': u'Willam ø. Unicoder', 'place': u'éSpandland'} True """ - def __init__(self, csvfile, fieldnames=None, restkey=None, restval=None, - dialect='excel', encoding='utf-8', errors='strict', *args, - **kwds): + + def __init__( + self, + csvfile, + fieldnames=None, + restkey=None, + restval=None, + dialect="excel", + encoding="utf-8", + errors="strict", + *args, + **kwds, + ): if fieldnames is not None: fieldnames = _stringify_list(fieldnames, encoding) - csv.DictReader.__init__(self, csvfile, fieldnames, restkey, restval, dialect, *args, **kwds) - self.reader = UnicodeReader(csvfile, dialect, encoding=encoding, - errors=errors, *args, **kwds) - if fieldnames is None and not hasattr(csv.DictReader, 'fieldnames'): + csv.DictReader.__init__( + self, csvfile, fieldnames, restkey, restval, dialect, *args, **kwds + ) + self.reader = UnicodeReader( + csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds + ) + if fieldnames is None and not hasattr(csv.DictReader, "fieldnames"): # Python 2.5 fieldnames workaround. (http://bugs.python.org/issue3436) reader = UnicodeReader(csvfile, dialect, encoding=encoding, *args, **kwds) self.fieldnames = _stringify_list(reader.next(), reader.encoding) - self.unicode_fieldnames = [_unicodify(f, encoding) for f in - self.fieldnames] + self.unicode_fieldnames = [_unicodify(f, encoding) for f in self.fieldnames] self.unicode_restkey = _unicodify(restkey, encoding) def next(self): row = csv.DictReader.next(self) - result = dict((uni_key, row[str_key]) for (str_key, uni_key) in - izip(self.fieldnames, self.unicode_fieldnames)) + result = dict( + (uni_key, row[str_key]) + for (str_key, uni_key) in zip(self.fieldnames, self.unicode_fieldnames) + ) rest = row.get(self.restkey) if rest: result[self.unicode_restkey] = rest diff --git a/textblob/utils.py b/src/textblob/utils.py similarity index 80% rename from textblob/utils.py rename to src/textblob/utils.py index c0646938..7be12c9e 100644 --- a/textblob/utils.py +++ b/src/textblob/utils.py @@ -1,8 +1,7 @@ -# -*- coding: utf-8 -*- import re import string -PUNCTUATION_REGEX = re.compile('[{0}]'.format(re.escape(string.punctuation))) +PUNCTUATION_REGEX = re.compile(f"[{re.escape(string.punctuation)}]") def strip_punc(s, all=False): @@ -13,7 +12,7 @@ def strip_punc(s, all=False): the ends of the string. """ if all: - return PUNCTUATION_REGEX.sub('', s.strip()) + return PUNCTUATION_REGEX.sub("", s.strip()) else: return s.strip().strip(string.punctuation) @@ -28,7 +27,7 @@ def lowerstrip(s, all=False): return strip_punc(s.lower().strip(), all=all) -def tree2str(tree, concat=' '): +def tree2str(tree, concat=" "): """Convert a nltk.tree.Tree to a string. 
For example: @@ -37,7 +36,7 @@ def tree2str(tree, concat=' '): return concat.join([word for (word, tag) in tree]) -def filter_insignificant(chunk, tag_suffixes=('DT', 'CC', 'PRP$', 'PRP')): +def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")): """Filter out insignificant (word, tag) tuples from a chunk of text.""" good = [] for word, tag in chunk: @@ -53,4 +52,4 @@ def filter_insignificant(chunk, tag_suffixes=('DT', 'CC', 'PRP$', 'PRP')): def is_filelike(obj): """Return whether ``obj`` is a file-like object.""" - return hasattr(obj, 'read') + return hasattr(obj, "read") diff --git a/textblob/wordnet.py b/src/textblob/wordnet.py similarity index 94% rename from textblob/wordnet.py rename to src/textblob/wordnet.py index 4c89b6bf..71486ff3 100644 --- a/textblob/wordnet.py +++ b/src/textblob/wordnet.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Wordnet interface. Contains classes for creating Synsets and Lemmas directly. diff --git a/tasks.py b/tasks.py deleted file mode 100644 index cac9064d..00000000 --- a/tasks.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import os -import webbrowser - -from invoke import task - -docs_dir = 'docs' -build_dir = os.path.join(docs_dir, '_build') - -@task -def test(ctx): - ctx.run("python run_tests.py", pty=True) - - -@task -def clean(ctx): - ctx.run("rm -rf build") - ctx.run("rm -rf dist") - ctx.run("rm -rf textblob.egg-info") - clean_docs(ctx) - print("Cleaned up.") - -@task -def clean_docs(ctx): - ctx.run("rm -rf %s" % build_dir) - - -@task -def browse_docs(ctx): - path = os.path.join(build_dir, 'index.html') - webbrowser.open_new_tab(path) - -@task -def docs(ctx, clean=False, browse=False): - if clean: - clean_docs(ctx) - ctx.run("sphinx-build %s %s" % (docs_dir, build_dir), pty=True) - if browse: - browse_docs(ctx) - -@task -def readme(ctx, browse=False): - ctx.run("rst2html.py README.rst > README.html", pty=True) - if browse: - webbrowser.open_new_tab('README.html') - -@task -def doctest(ctx): - os.chdir(docs_dir) - ctx.run("make doctest") diff --git a/tests/test_blob.py b/tests/test_blob.py index 3e5c1f35..2be94f36 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -1,196 +1,181 @@ -# -*- coding: utf-8 -*- """ Tests for the text processor. 
""" -from __future__ import unicode_literals import json -from unittest import TestCase, main from datetime import datetime -import mock +from unittest import TestCase -from nose.tools import * # noqa (PEP8 asserts) -from nose.plugins.attrib import attr import nltk +import pytest -from textblob.compat import PY2, unicode, basestring, binary_type import textblob as tb +import textblob.wordnet as wn +from textblob.classifiers import NaiveBayesClassifier from textblob.np_extractors import ConllExtractor, FastNPExtractor -from textblob.taggers import NLTKTagger, PatternTagger -from textblob.tokenizers import WordTokenizer, SentenceTokenizer -from textblob.sentiments import NaiveBayesAnalyzer, PatternAnalyzer from textblob.parsers import PatternParser -from textblob.classifiers import NaiveBayesClassifier -import textblob.wordnet as wn +from textblob.sentiments import NaiveBayesAnalyzer, PatternAnalyzer +from textblob.taggers import NLTKTagger, PatternTagger +from textblob.tokenizers import SentenceTokenizer, WordTokenizer Synset = nltk.corpus.reader.Synset train = [ - ('I love this sandwich.', 'pos'), - ('This is an amazing place!', 'pos'), - ("What a truly amazing dinner.", 'pos'), - ('I feel very good about these beers.', 'pos'), - ('This is my best work.', 'pos'), - ("What an awesome view", 'pos'), - ('I do not like this restaurant', 'neg'), - ('I am tired of this stuff.', 'neg'), - ("I can't deal with this", 'neg'), - ('He is my sworn enemy!', 'neg'), - ('My boss is horrible.', 'neg') + ("I love this sandwich.", "pos"), + ("This is an amazing place!", "pos"), + ("What a truly amazing dinner.", "pos"), + ("I feel very good about these beers.", "pos"), + ("This is my best work.", "pos"), + ("What an awesome view", "pos"), + ("I do not like this restaurant", "neg"), + ("I am tired of this stuff.", "neg"), + ("I can't deal with this", "neg"), + ("He is my sworn enemy!", "neg"), + ("My boss is horrible.", "neg"), ] test = [ - ('The beer was good.', 'pos'), - ('I do not enjoy my job', 'neg'), - ("I ain't feeling dandy today.", 'neg'), - ("I feel amazing!", 'pos'), - ('Gary is a friend of mine.', 'pos'), - ("I can't believe I'm doing this.", 'neg') + ("The beer was good.", "pos"), + ("I do not enjoy my job", "neg"), + ("I ain't feeling dandy today.", "neg"), + ("I feel amazing!", "pos"), + ("Gary is a friend of mine.", "pos"), + ("I can't believe I'm doing this.", "neg"), ] classifier = NaiveBayesClassifier(train) -class WordListTest(TestCase): +class WordListTest(TestCase): def setUp(self): - self.words = 'Beautiful is better than ugly'.split() - self.mixed = ['dog', 'dogs', 'blob', 'Blobs', 'text'] + self.words = "Beautiful is better than ugly".split() + self.mixed = ["dog", "dogs", "blob", "Blobs", "text"] def test_len(self): - wl = tb.WordList(['Beautiful', 'is', 'better']) - assert_equal(len(wl), 3) + wl = tb.WordList(["Beautiful", "is", "better"]) + assert len(wl) == 3 def test_slicing(self): wl = tb.WordList(self.words) first = wl[0] - assert_true(isinstance(first, tb.Word)) - assert_equal(first, 'Beautiful') + assert isinstance(first, tb.Word) + assert first == "Beautiful" dogs = wl[0:2] - assert_true(isinstance(dogs, tb.WordList)) - assert_equal(dogs, tb.WordList(['Beautiful', 'is'])) + assert isinstance(dogs, tb.WordList) + assert dogs == tb.WordList(["Beautiful", "is"]) def test_repr(self): - wl = tb.WordList(['Beautiful', 'is', 'better']) - if PY2: - assert_equal(repr(wl), "WordList([u'Beautiful', u'is', u'better'])") - else: - assert_equal(repr(wl), "WordList(['Beautiful', 'is', 
'better'])") + wl = tb.WordList(["Beautiful", "is", "better"]) + assert repr(wl) == "WordList(['Beautiful', 'is', 'better'])" def test_slice_repr(self): - wl = tb.WordList(['Beautiful', 'is', 'better']) - if PY2: - assert_equal(repr(wl[:2]), "WordList([u'Beautiful', u'is'])") - else: - assert_equal(repr(wl[:2]), "WordList(['Beautiful', 'is'])") + wl = tb.WordList(["Beautiful", "is", "better"]) + assert repr(wl[:2]) == "WordList(['Beautiful', 'is'])" def test_str(self): wl = tb.WordList(self.words) - assert_equal(str(wl), str(self.words)) + assert str(wl) == str(self.words) def test_singularize(self): - wl = tb.WordList(['dogs', 'cats', 'buffaloes', 'men', 'mice', 'offspring']) - assert_equal(wl.singularize(), - tb.WordList(['dog', 'cat', 'buffalo', 'man', 'mouse', 'offspring'])) + wl = tb.WordList(["dogs", "cats", "buffaloes", "men", "mice", "offspring"]) + assert wl.singularize() == tb.WordList( + ["dog", "cat", "buffalo", "man", "mouse", "offspring"] + ) def test_pluralize(self): - wl = tb.WordList(['dog', 'cat', 'buffalo', 'antelope']) - assert_equal(wl.pluralize(), tb.WordList(['dogs', 'cats', 'buffaloes', 'antelope'])) + wl = tb.WordList(["dog", "cat", "buffalo", "antelope"]) + assert wl.pluralize() == tb.WordList(["dogs", "cats", "buffaloes", "antelope"]) - @attr('slow') + @pytest.mark.slow def test_lemmatize(self): wl = tb.WordList(["cat", "dogs", "oxen"]) - assert_equal(wl.lemmatize(), tb.WordList(['cat', 'dog', 'ox'])) + assert wl.lemmatize() == tb.WordList(["cat", "dog", "ox"]) - def test_stem(self): #only PorterStemmer tested + def test_stem(self): # only PorterStemmer tested wl = tb.WordList(["cat", "dogs", "oxen"]) - assert_equal(wl.stem(), tb.WordList(['cat', 'dog', 'oxen'])) + assert wl.stem() == tb.WordList(["cat", "dog", "oxen"]) def test_upper(self): wl = tb.WordList(self.words) - assert_equal(wl.upper(), tb.WordList([w.upper() for w in self.words])) + assert wl.upper() == tb.WordList([w.upper() for w in self.words]) def test_lower(self): - wl = tb.WordList(['Zen', 'oF', 'PYTHON']) - assert_equal(wl.lower(), tb.WordList(['zen', 'of', 'python'])) + wl = tb.WordList(["Zen", "oF", "PYTHON"]) + assert wl.lower() == tb.WordList(["zen", "of", "python"]) def test_count(self): - wl = tb.WordList(['monty', 'python', 'Python', 'Monty']) - assert_equal(wl.count('monty'), 2) - assert_equal(wl.count('monty', case_sensitive=True), 1) - assert_equal(wl.count('mon'), 0) + wl = tb.WordList(["monty", "python", "Python", "Monty"]) + assert wl.count("monty") == 2 + assert wl.count("monty", case_sensitive=True) == 1 + assert wl.count("mon") == 0 def test_convert_to_list(self): wl = tb.WordList(self.words) - assert_equal(list(wl), self.words) + assert list(wl) == self.words def test_append(self): - wl = tb.WordList(['dog']) + wl = tb.WordList(["dog"]) wl.append("cat") - assert_true(isinstance(wl[1], tb.Word)) - wl.append(('a', 'tuple')) - assert_true(isinstance(wl[2], tuple)) + assert isinstance(wl[1], tb.Word) + wl.append(("a", "tuple")) + assert isinstance(wl[2], tuple) def test_extend(self): wl = tb.WordList(["cats", "dogs"]) wl.extend(["buffalo", 4]) - assert_true(isinstance(wl[2], tb.Word)) - assert_true(isinstance(wl[3], int)) + assert isinstance(wl[2], tb.Word) + assert isinstance(wl[3], int) def test_pop(self): - wl = tb.WordList(['cats', 'dogs']) - assert_equal(wl.pop(), tb.Word('dogs')) - assert_raises(IndexError, wl.__getitem__, 1) - assert_equal(wl.pop(), tb.Word('cats')) - assert_equal(len(wl), 0) - assert_raises(IndexError, wl.pop) + wl = tb.WordList(["cats", "dogs"]) + assert 
wl.pop() == tb.Word("dogs") + with pytest.raises(IndexError): + wl[1] + assert wl.pop() == tb.Word("cats") + assert len(wl) == 0 + with pytest.raises(IndexError): + wl.pop() def test_setitem(self): - wl = tb.WordList(['I', 'love', 'JavaScript']) - wl[2] = tb.Word('Python') - assert_equal(wl[2], tb.Word('Python')) + wl = tb.WordList(["I", "love", "JavaScript"]) + wl[2] = tb.Word("Python") + assert wl[2] == tb.Word("Python") def test_reverse(self): - wl = tb.WordList(['head', 'shoulders', 'knees', 'toes']) + wl = tb.WordList(["head", "shoulders", "knees", "toes"]) wl.reverse() - assert_equal(list(wl), ['toes', 'knees', 'shoulders', 'head']) - + assert list(wl) == ["toes", "knees", "shoulders", "head"] class SentenceTest(TestCase): - def setUp(self): - self.raw_sentence = \ - 'Any place with frites and Belgian beer has my vote.' + self.raw_sentence = "Any place with frites and Belgian beer has my vote." self.sentence = tb.Sentence(self.raw_sentence) def test_repr(self): - # In Py2, repr returns bytestring - if PY2: - assert_equal(repr(self.sentence), - b"Sentence(\"{0}\")".format(binary_type(self.raw_sentence))) - # In Py3, returns text type string - else: - assert_equal(repr(self.sentence), 'Sentence("{0}")'.format(self.raw_sentence)) + assert repr(self.sentence) == f'Sentence("{self.raw_sentence}")' def test_stripped_sentence(self): - assert_equal(self.sentence.stripped, - 'any place with frites and belgian beer has my vote') + assert ( + self.sentence.stripped + == "any place with frites and belgian beer has my vote" + ) def test_len(self): - assert_equal(len(self.sentence), len(self.raw_sentence)) + assert len(self.sentence) == len(self.raw_sentence) - @attr('slow') + @pytest.mark.slow def test_dict(self): sentence_dict = self.sentence.dict - assert_equal(sentence_dict, { - 'raw': self.raw_sentence, - 'start_index': 0, - 'polarity': 0.0, - 'subjectivity': 0.0, - 'end_index': len(self.raw_sentence) - 1, - 'stripped': 'any place with frites and belgian beer has my vote', - 'noun_phrases': self.sentence.noun_phrases, - }) + assert sentence_dict == { + "raw": self.raw_sentence, + "start_index": 0, + "polarity": 0.0, + "subjectivity": 0.0, + "end_index": len(self.raw_sentence) - 1, + "stripped": "any place with frites and belgian beer has my vote", + "noun_phrases": self.sentence.noun_phrases, + } def test_pos_tags(self): then1 = datetime.now() @@ -205,58 +190,45 @@ def test_pos_tags(self): # Getting the pos tags the second time should be faster # because they were stored as an attribute the first time - assert_true(t2 < t1) - assert_equal(tagged, - [('Any', 'DT'), ('place', 'NN'), ('with', 'IN'), - ('frites', 'NNS'), ('and', 'CC'), ('Belgian', 'JJ'), - ('beer', 'NN'), ('has', 'VBZ'), ('my', 'PRP$'), - ('vote', 'NN')] - ) - - @attr('slow') + assert t2 < t1 + assert tagged == [ + ("Any", "DT"), + ("place", "NN"), + ("with", "IN"), + ("frites", "NNS"), + ("and", "CC"), + ("Belgian", "JJ"), + ("beer", "NN"), + ("has", "VBZ"), + ("my", "PRP$"), + ("vote", "NN"), + ] + + @pytest.mark.slow def test_noun_phrases(self): nps = self.sentence.noun_phrases - assert_equal(nps, ['belgian beer']) + assert nps == ["belgian beer"] def test_words_are_word_objects(self): words = self.sentence.words - assert_true(isinstance(words[0], tb.Word)) - assert_equal(words[1].pluralize(), 'places') + assert isinstance(words[0], tb.Word) + assert words[1].pluralize() == "places" def test_string_equality(self): - assert_equal(self.sentence, 'Any place with frites and Belgian beer has my vote.') - - 
@mock.patch('textblob.translate.Translator.translate') - def test_translate(self, mock_translate): - mock_translate.return_value = 'Esta es una frase.' - blob = tb.Sentence("This is a sentence.") - translated = blob.translate(to="es") - assert_true(isinstance(translated, tb.Sentence)) - assert_equal(translated, "Esta es una frase.") + assert self.sentence == "Any place with frites and Belgian beer has my vote." def test_correct(self): blob = tb.Sentence("I havv bad speling.") - assert_true(isinstance(blob.correct(), tb.Sentence)) - assert_equal(blob.correct(), tb.Sentence("I have bad spelling.")) + assert isinstance(blob.correct(), tb.Sentence) + assert blob.correct() == tb.Sentence("I have bad spelling.") blob = tb.Sentence("I havv \ngood speling.") - assert_true(isinstance(blob.correct(), tb.Sentence)) - assert_equal(blob.correct(), tb.Sentence("I have \ngood spelling.")) - - - @mock.patch('textblob.translate.Translator.translate') - def test_translate_detects_language_by_default(self, mock_translate): - text = unicode("ذات سيادة كاملة") - mock_translate.return_value = "With full sovereignty" - blob = tb.TextBlob(text) - blob.translate() - assert_true(mock_translate.called_once_with(text, from_lang='auto')) + assert isinstance(blob.correct(), tb.Sentence) + assert blob.correct() == tb.Sentence("I have \ngood spelling.") class TextBlobTest(TestCase): - def setUp(self): - self.text = \ - """Beautiful is better than ugly. + self.text = """Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex. Complex is better than complicated. @@ -277,7 +249,7 @@ def setUp(self): Namespaces are one honking great idea -- let's do more of those!""" self.blob = tb.TextBlob(self.text) - self.np_test_text = ''' + self.np_test_text = """ Python is a widely used general-purpose, high-level programming language. Its design philosophy emphasizes code readability, and its syntax allows programmers to express concepts in fewer @@ -291,775 +263,757 @@ def setUp(self): Using third-party tools, Python code can be packaged into standalone executable programs. Python interpreters are available for many operating systems. CPython, the reference implementation of Python, is free and open source software and h as a community-based development model, as do nearly all of its alternative implementations. CPython -is managed by the non-profit Python Software Foundation.''' +is managed by the non-profit Python Software Foundation.""" # noqa: E501 self.np_test_blob = tb.TextBlob(self.np_test_text) self.short = "Beautiful is better than ugly. " self.short_blob = tb.TextBlob(self.short) def test_init(self): - blob = tb.TextBlob('Wow I love this place. It really rocks my socks!') - assert_equal(len(blob.sentences), 2) - assert_equal(blob.sentences[1].stripped, 'it really rocks my socks') - assert_equal(blob.string, blob.raw) + blob = tb.TextBlob("Wow I love this place. It really rocks my socks!") + assert len(blob.sentences) == 2 + assert blob.sentences[1].stripped == "it really rocks my socks" + assert blob.string == blob.raw # Must initialize with a string - assert_raises(TypeError, tb.TextBlob.__init__, ['invalid']) + with pytest.raises(TypeError): + tb.TextBlob(["invalid"]) def test_string_equality(self): blob = tb.TextBlob("Textblobs should be equal to strings.") - assert_equal(blob, "Textblobs should be equal to strings.") + assert blob == "Textblobs should be equal to strings." 
def test_string_comparison(self): blob = tb.TextBlob("apple") - assert_true(blob < "banana") - assert_true(blob > 'aardvark') + assert blob < "banana" + assert blob > "aardvark" def test_hash(self): - blob = tb.TextBlob('apple') - assert_equal(hash(blob), hash('apple')) - assert_not_equal(hash(blob), hash('banana')) + blob = tb.TextBlob("apple") + assert hash(blob) == hash("apple") + assert hash(blob) != hash("banana") def test_stripped(self): blob = tb.TextBlob("Um... well this ain't right.!..") - assert_equal(blob.stripped, "um well this aint right") + assert blob.stripped == "um well this aint right" def test_ngrams(self): blob = tb.TextBlob("I am eating a pizza.") three_grams = blob.ngrams() - assert_equal(three_grams, [ - tb.WordList(('I', 'am', 'eating')), - tb.WordList(('am', 'eating', 'a')), - tb.WordList(('eating', 'a', 'pizza')) - ]) + assert three_grams == [ + tb.WordList(("I", "am", "eating")), + tb.WordList(("am", "eating", "a")), + tb.WordList(("eating", "a", "pizza")), + ] four_grams = blob.ngrams(n=4) - assert_equal(four_grams, [ - tb.WordList(('I', 'am', 'eating', 'a')), - tb.WordList(('am', 'eating', 'a', 'pizza')) - ]) + assert four_grams == [ + tb.WordList(("I", "am", "eating", "a")), + tb.WordList(("am", "eating", "a", "pizza")), + ] def test_clean_html(self): - html = 'Python is a widely used general-purpose, high-level programming language.' - assert_raises(NotImplementedError, lambda: tb.TextBlob(html, clean_html=True)) + html = ( + "Python is a widely used " + 'general-purpose, ' + '' + "high-level programming language." + ) + with pytest.raises(NotImplementedError): + tb.TextBlob(html, clean_html=True) def test_sentences(self): blob = self.blob - assert_equal(len(blob.sentences), 19) - assert_true(isinstance(blob.sentences[0], tb.Sentence)) + assert len(blob.sentences) == 19 + assert isinstance(blob.sentences[0], tb.Sentence) def test_senences_with_space_before_punctuation(self): text = "Uh oh. This sentence might cause some problems. : Now we're ok." b = tb.TextBlob(text) - assert_equal(len(b.sentences), 3) + assert len(b.sentences) == 3 def test_sentiment_of_foreign_text(self): - blob = tb.TextBlob(u'Nous avons cherch\xe9 un motel dans la r\xe9gion de ' - 'Madison, mais les motels ne sont pas nombreux et nous avons ' - 'finalement choisi un Motel 6, attir\xe9s par le bas ' - 'prix de la chambre.') - assert_true(isinstance(blob.sentiment[0], float)) + blob = tb.TextBlob( + "Nous avons cherch\xe9 un motel dans la r\xe9gion de " + "Madison, mais les motels ne sont pas nombreux et nous avons " + "finalement choisi un Motel 6, attir\xe9s par le bas " + "prix de la chambre." + ) + assert isinstance(blob.sentiment[0], float) def test_iter(self): for i, letter in enumerate(self.short_blob): - assert_equal(letter, self.short[i]) + assert letter == self.short[i] def test_raw_sentences(self): blob = tb.TextBlob(self.text) - assert_equal(len(blob.raw_sentences), 19) - assert_equal(blob.raw_sentences[0], "Beautiful is better than ugly.") + assert len(blob.raw_sentences) == 19 + assert blob.raw_sentences[0] == "Beautiful is better than ugly." 
def test_blob_with_no_sentences(self): text = "this isn't really a sentence it's just a long string of words" blob = tb.TextBlob(text) # the blob just has one sentence - assert_equal(len(blob.sentences), 1) + assert len(blob.sentences) == 1 # the start index is 0, the end index is len(text) - 1 - assert_equal(blob.sentences[0].start_index, 0) - assert_equal(blob.sentences[0].end_index, len(text)) + assert blob.sentences[0].start_index == 0 + assert blob.sentences[0].end_index == len(text) def test_len(self): - blob = tb.TextBlob('lorem ipsum') - assert_equal(len(blob), len('lorem ipsum')) + blob = tb.TextBlob("lorem ipsum") + assert len(blob) == len("lorem ipsum") def test_repr(self): - blob1 = tb.TextBlob('lorem ipsum') - if PY2: - assert_equal(repr(blob1), b"TextBlob(\"{0}\")".format(binary_type('lorem ipsum'))) - else: - assert_equal(repr(blob1), "TextBlob(\"{0}\")".format('lorem ipsum')) + blob1 = tb.TextBlob("lorem ipsum") + assert repr(blob1) == 'TextBlob("{}")'.format("lorem ipsum") def test_cmp(self): - blob1 = tb.TextBlob('lorem ipsum') - blob2 = tb.TextBlob('lorem ipsum') - blob3 = tb.TextBlob('dolor sit amet') + blob1 = tb.TextBlob("lorem ipsum") + blob2 = tb.TextBlob("lorem ipsum") + blob3 = tb.TextBlob("dolor sit amet") - assert_true(blob1 == blob2) # test == - assert_true(blob1 > blob3) # test > - assert_true(blob1 >= blob3) # test >= - assert_true(blob3 < blob2) # test < - assert_true(blob3 <= blob2) # test <= + assert blob1 == blob2 # test == + assert blob1 > blob3 # test > + assert blob1 >= blob3 # test >= + assert blob3 < blob2 # test < + assert blob3 <= blob2 # test <= def test_invalid_comparison(self): blob = tb.TextBlob("one") - if PY2: - # invalid comparison returns False - assert_false(blob < 2) - else: - # invalid comparison raises Error - with assert_raises(TypeError): - blob < 2 + # invalid comparison raises Error + with pytest.raises(TypeError): + blob < 2 # noqa: B015 def test_words(self): - blob = tb.TextBlob('Beautiful is better than ugly. ' - 'Explicit is better than implicit.') - assert_true(isinstance(blob.words, tb.WordList)) - assert_equal(blob.words, tb.WordList([ - 'Beautiful', - 'is', - 'better', - 'than', - 'ugly', - 'Explicit', - 'is', - 'better', - 'than', - 'implicit', - ])) + blob = tb.TextBlob( + "Beautiful is better than ugly. " "Explicit is better than implicit." + ) + assert isinstance(blob.words, tb.WordList) + assert blob.words == tb.WordList( + [ + "Beautiful", + "is", + "better", + "than", + "ugly", + "Explicit", + "is", + "better", + "than", + "implicit", + ] + ) short = tb.TextBlob("Just a bundle of words") - assert_equal(short.words, tb.WordList([ - 'Just', 'a', 'bundle', 'of', 'words' - ])) + assert short.words == tb.WordList(["Just", "a", "bundle", "of", "words"]) def test_words_includes_apostrophes_in_contractions(self): blob = tb.TextBlob("Let's test this.") - assert_equal(blob.words, tb.WordList(['Let', "'s", "test", "this"])) + assert blob.words == tb.WordList(["Let", "'s", "test", "this"]) blob2 = tb.TextBlob("I can't believe it's not butter.") - assert_equal(blob2.words, tb.WordList(['I', 'ca', "n't", "believe", - 'it', "'s", "not", "butter"])) + assert blob2.words == tb.WordList( + ["I", "ca", "n't", "believe", "it", "'s", "not", "butter"] + ) def test_pos_tags(self): - blob = tb.TextBlob('Simple is better than complex. 
' - 'Complex is better than complicated.') - assert_equal(blob.pos_tags, [ - ('Simple', 'NN'), - ('is', 'VBZ'), - ('better', 'JJR'), - ('than', 'IN'), - ('complex', 'JJ'), - ('Complex', 'NNP'), - ('is', 'VBZ'), - ('better', 'JJR'), - ('than', 'IN'), - ('complicated', 'VBN'), - ]) + blob = tb.TextBlob( + "Simple is better than complex. " "Complex is better than complicated." + ) + assert blob.pos_tags == [ + ("Simple", "NN"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complex", "JJ"), + ("Complex", "NNP"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complicated", "VBN"), + ] def test_tags(self): - assert_equal(self.blob.tags, self.blob.pos_tags) + assert self.blob.tags == self.blob.pos_tags def test_tagging_nonascii(self): - b = tb.TextBlob('Learn how to make the five classic French mother sauces: ' - 'Béchamel, Tomato Sauce, Espagnole, Velouté and Hollandaise.') + b = tb.TextBlob( + "Learn how to make the five classic French mother sauces: " + "Béchamel, Tomato Sauce, Espagnole, Velouté and Hollandaise." + ) tags = b.tags - assert_true(isinstance(tags[0][0], unicode)) + assert isinstance(tags[0][0], str) def test_pos_tags_includes_one_letter_articles(self): blob = tb.TextBlob("This is a sentence.") - assert_equal(blob.pos_tags[2][0], 'a') + assert blob.pos_tags[2][0] == "a" - @attr('slow') + @pytest.mark.slow def test_np_extractor_defaults_to_fast_tagger(self): text = "Python is a high-level scripting language." blob1 = tb.TextBlob(text) - assert_true(isinstance(blob1.np_extractor, FastNPExtractor)) + assert isinstance(blob1.np_extractor, FastNPExtractor) def test_np_extractor_is_shared_among_instances(self): blob1 = tb.TextBlob("This is one sentence") blob2 = tb.TextBlob("This is another sentence") - assert_true(blob1.np_extractor is blob2.np_extractor) + assert blob1.np_extractor is blob2.np_extractor - @attr('slow') + @pytest.mark.slow def test_can_use_different_np_extractors(self): e = ConllExtractor() text = "Python is a high-level scripting language." 
blob = tb.TextBlob(text) blob.np_extractor = e - assert_true(isinstance(blob.np_extractor, ConllExtractor)) + assert isinstance(blob.np_extractor, ConllExtractor) def test_can_use_different_sentanalyzer(self): blob = tb.TextBlob("I love this car", analyzer=NaiveBayesAnalyzer()) - assert_true(isinstance(blob.analyzer, NaiveBayesAnalyzer)) + assert isinstance(blob.analyzer, NaiveBayesAnalyzer) - @attr("slow") + @pytest.mark.slow def test_discrete_sentiment(self): blob = tb.TextBlob("I feel great today.", analyzer=NaiveBayesAnalyzer()) - assert_equal(blob.sentiment[0], 'pos') + assert blob.sentiment[0] == "pos" def test_can_get_subjectivity_and_polarity_with_different_analyzer(self): blob = tb.TextBlob("I love this car.", analyzer=NaiveBayesAnalyzer()) pattern = PatternAnalyzer() - assert_equal(blob.polarity, pattern.analyze(str(blob))[0]) - assert_equal(blob.subjectivity, pattern.analyze(str(blob))[1]) + assert blob.polarity == pattern.analyze(str(blob))[0] + assert blob.subjectivity == pattern.analyze(str(blob))[1] def test_pos_tagger_defaults_to_pattern(self): blob = tb.TextBlob("some text") - assert_true(isinstance(blob.pos_tagger, NLTKTagger)) + assert isinstance(blob.pos_tagger, NLTKTagger) def test_pos_tagger_is_shared_among_instances(self): blob1 = tb.TextBlob("This is one sentence") blob2 = tb.TextBlob("This is another sentence.") - assert_true(blob1.pos_tagger is blob2.pos_tagger) + assert blob1.pos_tagger is blob2.pos_tagger def test_can_use_different_pos_tagger(self): tagger = NLTKTagger() blob = tb.TextBlob("this is some text", pos_tagger=tagger) - assert_true(isinstance(blob.pos_tagger, NLTKTagger)) + assert isinstance(blob.pos_tagger, NLTKTagger) - @attr('slow') + @pytest.mark.slow def test_can_pass_np_extractor_to_constructor(self): e = ConllExtractor() - blob = tb.TextBlob('Hello world!', np_extractor=e) - assert_true(isinstance(blob.np_extractor, ConllExtractor)) + blob = tb.TextBlob("Hello world!", np_extractor=e) + assert isinstance(blob.np_extractor, ConllExtractor) def test_getitem(self): - blob = tb.TextBlob('lorem ipsum') - assert_equal(blob[0], 'l') - assert_equal(blob[0:5], tb.TextBlob('lorem')) + blob = tb.TextBlob("lorem ipsum") + assert blob[0] == "l" + assert blob[0:5] == tb.TextBlob("lorem") def test_upper(self): - blob = tb.TextBlob('lorem ipsum') - assert_true(is_blob(blob.upper())) - assert_equal(blob.upper(), tb.TextBlob('LOREM IPSUM')) + blob = tb.TextBlob("lorem ipsum") + assert is_blob(blob.upper()) + assert blob.upper() == tb.TextBlob("LOREM IPSUM") def test_upper_and_words(self): - blob = tb.TextBlob('beautiful is better') - assert_equal(blob.upper().words, tb.WordList(['BEAUTIFUL', 'IS', 'BETTER' - ])) + blob = tb.TextBlob("beautiful is better") + assert blob.upper().words == tb.WordList(["BEAUTIFUL", "IS", "BETTER"]) def test_lower(self): - blob = tb.TextBlob('Lorem Ipsum') - assert_true(is_blob(blob.lower())) - assert_equal(blob.lower(), tb.TextBlob('lorem ipsum')) + blob = tb.TextBlob("Lorem Ipsum") + assert is_blob(blob.lower()) + assert blob.lower() == tb.TextBlob("lorem ipsum") def test_find(self): - text = 'Beautiful is better than ugly.' + text = "Beautiful is better than ugly." blob = tb.TextBlob(text) - assert_equal(blob.find('better', 5, len(blob)), text.find('better', 5, - len(text))) + assert blob.find("better", 5, len(blob)) == text.find("better", 5, len(text)) def test_rfind(self): - text = 'Beautiful is better than ugly. ' + text = "Beautiful is better than ugly. 
" blob = tb.TextBlob(text) - assert_equal(blob.rfind('better'), text.rfind('better')) + assert blob.rfind("better") == text.rfind("better") def test_startswith(self): blob = tb.TextBlob(self.text) - assert_true(blob.startswith('Beautiful')) - assert_true(blob.starts_with('Beautiful')) + assert blob.startswith("Beautiful") + assert blob.starts_with("Beautiful") def test_endswith(self): blob = tb.TextBlob(self.text) - assert_true(blob.endswith('of those!')) - assert_true(blob.ends_with('of those!')) + assert blob.endswith("of those!") + assert blob.ends_with("of those!") def test_split(self): - blob = tb.TextBlob('Beautiful is better') - assert_equal(blob.split(), tb.WordList(['Beautiful', 'is', 'better'])) + blob = tb.TextBlob("Beautiful is better") + assert blob.split() == tb.WordList(["Beautiful", "is", "better"]) def test_title(self): - blob = tb.TextBlob('Beautiful is better') - assert_equal(blob.title(), tb.TextBlob('Beautiful Is Better')) + blob = tb.TextBlob("Beautiful is better") + assert blob.title() == tb.TextBlob("Beautiful Is Better") def test_format(self): - blob = tb.TextBlob('1 + 1 = {0}') - assert_equal(blob.format(1 + 1), tb.TextBlob('1 + 1 = 2')) - assert_equal('1 + 1 = {0}'.format(tb.TextBlob('2')), '1 + 1 = 2') + blob = tb.TextBlob("1 + 1 = {0}") + assert blob.format(1 + 1) == tb.TextBlob("1 + 1 = 2") + assert "1 + 1 = {}".format(tb.TextBlob("2")) == "1 + 1 = 2" def test_using_indices_for_slicing(self): blob = tb.TextBlob("Hello world. How do you do?") sent1, sent2 = blob.sentences - assert_equal(blob[sent1.start:sent1.end], tb.TextBlob(str(sent1))) - assert_equal(blob[sent2.start:sent2.end], tb.TextBlob(str(sent2))) - + assert blob[sent1.start : sent1.end] == tb.TextBlob(str(sent1)) + assert blob[sent2.start : sent2.end] == tb.TextBlob(str(sent2)) def test_indices_with_only_one_sentences(self): blob = tb.TextBlob("Hello world.") sent1 = blob.sentences[0] - assert_equal(blob[sent1.start:sent1.end], tb.TextBlob(str(sent1))) + assert blob[sent1.start : sent1.end] == tb.TextBlob(str(sent1)) def test_indices_with_multiple_puncutations(self): blob = tb.TextBlob("Hello world. How do you do?! 
This has an ellipses...") sent1, sent2, sent3 = blob.sentences - assert_equal(blob[sent2.start:sent2.end], tb.TextBlob("How do you do?!")) - assert_equal(blob[sent3.start:sent3.end], tb.TextBlob("This has an ellipses...")) + assert blob[sent2.start : sent2.end] == tb.TextBlob("How do you do?!") + assert blob[sent3.start : sent3.end] == tb.TextBlob("This has an ellipses...") def test_indices_short_names(self): blob = tb.TextBlob(self.text) last_sentence = blob.sentences[len(blob.sentences) - 1] - assert_equal(last_sentence.start, last_sentence.start_index) - assert_equal(last_sentence.end, last_sentence.end_index) + assert last_sentence.start == last_sentence.start_index + assert last_sentence.end == last_sentence.end_index def test_replace(self): - blob = tb.TextBlob('textblob is a blobby blob') - assert_equal(blob.replace('blob', 'bro'), - tb.TextBlob('textbro is a broby bro')) - assert_equal(blob.replace('blob', 'bro', 1), - tb.TextBlob('textbro is a blobby blob')) + blob = tb.TextBlob("textblob is a blobby blob") + assert blob.replace("blob", "bro") == tb.TextBlob("textbro is a broby bro") + assert blob.replace("blob", "bro", 1) == tb.TextBlob("textbro is a blobby blob") def test_join(self): - l = ['explicit', 'is', 'better'] - wl = tb.WordList(l) - assert_equal(tb.TextBlob(' ').join(l), tb.TextBlob('explicit is better')) - assert_equal(tb.TextBlob(' ').join(wl), tb.TextBlob('explicit is better')) + lst = ["explicit", "is", "better"] + wl = tb.WordList(lst) + assert tb.TextBlob(" ").join(lst) == tb.TextBlob("explicit is better") + assert tb.TextBlob(" ").join(wl) == tb.TextBlob("explicit is better") - @attr('slow') + @pytest.mark.slow def test_blob_noun_phrases(self): noun_phrases = self.np_test_blob.noun_phrases - assert_true('python' in noun_phrases) - assert_true('design philosophy' in noun_phrases) + assert "python" in noun_phrases + assert "design philosophy" in noun_phrases def test_word_counts(self): - blob = tb.TextBlob('Buffalo buffalo ate my blue buffalo.') - assert_equal(dict(blob.word_counts), { - 'buffalo': 3, - 'ate': 1, - 'my': 1, - 'blue': 1 - }) - assert_equal(blob.word_counts['buffalo'], 3) - assert_equal(blob.words.count('buffalo'), 3) - assert_equal(blob.words.count('buffalo', case_sensitive=True), 2) - assert_equal(blob.word_counts['blue'], 1) - assert_equal(blob.words.count('blue'), 1) - assert_equal(blob.word_counts['ate'], 1) - assert_equal(blob.words.count('ate'), 1) - assert_equal(blob.word_counts['buff'], 0) - assert_equal(blob.words.count('buff'), 0) + blob = tb.TextBlob("Buffalo buffalo ate my blue buffalo.") + assert dict(blob.word_counts) == {"buffalo": 3, "ate": 1, "my": 1, "blue": 1} + assert blob.word_counts["buffalo"] == 3 + assert blob.words.count("buffalo") == 3 + assert blob.words.count("buffalo", case_sensitive=True) == 2 + assert blob.word_counts["blue"] == 1 + assert blob.words.count("blue") == 1 + assert blob.word_counts["ate"] == 1 + assert blob.words.count("ate") == 1 + assert blob.word_counts["buff"] == 0 + assert blob.words.count("buff") == 0 blob2 = tb.TextBlob(self.text) - assert_equal(blob2.words.count('special'), 2) - assert_equal(blob2.words.count('special', case_sensitive=True), 1) + assert blob2.words.count("special") == 2 + assert blob2.words.count("special", case_sensitive=True) == 1 - @attr('slow') + @pytest.mark.slow def test_np_counts(self): # Add some text so that we have a noun phrase that # has a frequency greater than 1 noun_phrases = self.np_test_blob.noun_phrases - assert_equal(noun_phrases.count('python'), 6) - 
assert_equal(self.np_test_blob.np_counts['python'], noun_phrases.count('python')) - assert_equal(noun_phrases.count('cpython'), 2) - assert_equal(noun_phrases.count('not found'), 0) + assert noun_phrases.count("python") == 6 + assert self.np_test_blob.np_counts["python"] == noun_phrases.count("python") + assert noun_phrases.count("cpython") == 2 + assert noun_phrases.count("not found") == 0 def test_add(self): - blob1 = tb.TextBlob('Hello, world! ') - blob2 = tb.TextBlob('Hola mundo!') + blob1 = tb.TextBlob("Hello, world! ") + blob2 = tb.TextBlob("Hola mundo!") # Can add two text blobs - assert_equal(blob1 + blob2, tb.TextBlob('Hello, world! Hola mundo!')) + assert blob1 + blob2 == tb.TextBlob("Hello, world! Hola mundo!") # Can also add a string to a tb.TextBlob - assert_equal(blob1 + 'Hola mundo!', - tb.TextBlob('Hello, world! Hola mundo!')) + assert blob1 + "Hola mundo!" == tb.TextBlob("Hello, world! Hola mundo!") # Or both - assert_equal(blob1 + blob2 + ' Goodbye!', - tb.TextBlob('Hello, world! Hola mundo! Goodbye!')) + assert blob1 + blob2 + " Goodbye!" == tb.TextBlob( + "Hello, world! Hola mundo! Goodbye!" + ) # operands must be strings - assert_raises(TypeError, blob1.__add__, ['hello']) + with pytest.raises(TypeError): + blob1 + ["hello"] def test_unicode(self): blob = tb.TextBlob(self.text) - assert_equal(str(blob), str(self.text)) + assert str(blob) == str(self.text) def test_strip(self): - text = 'Beautiful is better than ugly. ' + text = "Beautiful is better than ugly. " blob = tb.TextBlob(text) - assert_true(is_blob(blob)) - assert_equal(blob.strip(), tb.TextBlob(text.strip())) + assert is_blob(blob) + assert blob.strip() == tb.TextBlob(text.strip()) def test_strip_and_words(self): - blob = tb.TextBlob('Beautiful is better! ') - assert_equal(blob.strip().words, tb.WordList(['Beautiful', 'is', 'better' - ])) + blob = tb.TextBlob("Beautiful is better! ") + assert blob.strip().words == tb.WordList(["Beautiful", "is", "better"]) def test_index(self): blob = tb.TextBlob(self.text) - assert_equal(blob.index('Namespaces'), self.text.index('Namespaces')) + assert blob.index("Namespaces") == self.text.index("Namespaces") def test_sentences_after_concatenation(self): - blob1 = tb.TextBlob('Beautiful is better than ugly. ') - blob2 = tb.TextBlob('Explicit is better than implicit.') + blob1 = tb.TextBlob("Beautiful is better than ugly. ") + blob2 = tb.TextBlob("Explicit is better than implicit.") concatenated = blob1 + blob2 - assert_equal(len(concatenated.sentences), 2) + assert len(concatenated.sentences) == 2 def test_sentiment(self): - positive = tb.TextBlob('This is the best, most amazing ' - 'text-processing library ever!') - assert_true(positive.sentiment[0] > 0.0) + positive = tb.TextBlob( + "This is the best, most amazing " "text-processing library ever!" + ) + assert positive.sentiment[0] > 0.0 negative = tb.TextBlob("bad bad bitches that's my muthufuckin problem.") - assert_true(negative.sentiment[0] < 0.0) + assert negative.sentiment[0] < 0.0 zen = tb.TextBlob(self.text) - assert_equal(round(zen.sentiment[0], 1), 0.2) + assert round(zen.sentiment[0], 1) == 0.2 def test_subjectivity(self): positive = tb.TextBlob("Oh my god this is so amazing! I'm so happy!") - assert_true(isinstance(positive.subjectivity, float)) - assert_true(positive.subjectivity > 0) + assert isinstance(positive.subjectivity, float) + assert positive.subjectivity > 0 def test_polarity(self): positive = tb.TextBlob("Oh my god this is so amazing! 
I'm so happy!") - assert_true(isinstance(positive.polarity, float)) - assert_true(positive.polarity > 0) + assert isinstance(positive.polarity, float) + assert positive.polarity > 0 def test_sentiment_of_emoticons(self): b1 = tb.TextBlob("Faces have values =)") b2 = tb.TextBlob("Faces have values") - assert_true(b1.sentiment[0] > b2.sentiment[0]) + assert b1.sentiment[0] > b2.sentiment[0] def test_bad_init(self): - assert_raises(TypeError, lambda: tb.TextBlob(['bad'])) - assert_raises(ValueError, lambda: tb.TextBlob("this is fine", - np_extractor="this is not fine")) - assert_raises(ValueError, lambda: tb.TextBlob("this is fine", - pos_tagger="this is not fine")) + with pytest.raises(TypeError): + tb.TextBlob(["bad"]) + with pytest.raises(ValueError): + tb.TextBlob("this is fine", np_extractor="this is not fine") + with pytest.raises(ValueError): + tb.TextBlob("this is fine", pos_tagger="this is not fine") def test_in(self): - blob = tb.TextBlob('Beautiful is better than ugly. ') - assert_true('better' in blob) - assert_true('fugly' not in blob) + blob = tb.TextBlob("Beautiful is better than ugly. ") + assert "better" in blob + assert "fugly" not in blob - @attr('slow') + @pytest.mark.slow def test_json(self): - blob = tb.TextBlob('Beautiful is better than ugly. ') - assert_equal(blob.json, blob.to_json()) + blob = tb.TextBlob("Beautiful is better than ugly. ") + assert blob.json == blob.to_json() blob_dict = json.loads(blob.json)[0] - assert_equal(blob_dict['stripped'], 'beautiful is better than ugly') - assert_equal(blob_dict['noun_phrases'], blob.sentences[0].noun_phrases) - assert_equal(blob_dict['start_index'], blob.sentences[0].start) - assert_equal(blob_dict['end_index'], blob.sentences[0].end) - assert_almost_equal(blob_dict['polarity'], - blob.sentences[0].polarity, places=4) - assert_almost_equal(blob_dict['subjectivity'], - blob.sentences[0].subjectivity, places=4) + assert blob_dict["stripped"] == "beautiful is better than ugly" + assert blob_dict["noun_phrases"] == blob.sentences[0].noun_phrases + assert blob_dict["start_index"] == blob.sentences[0].start + assert blob_dict["end_index"] == blob.sentences[0].end + assert blob_dict["polarity"] == pytest.approx( + blob.sentences[0].polarity, abs=1e-4 + ) + assert blob_dict["subjectivity"] == pytest.approx( + blob.sentences[0].subjectivity, abs=1e-4 + ) def test_words_are_word_objects(self): words = self.blob.words - assert_true(isinstance(words[0], tb.Word)) + assert isinstance(words[0], tb.Word) def test_words_have_pos_tags(self): - blob = tb.TextBlob('Simple is better than complex. ' - 'Complex is better than complicated.') + blob = tb.TextBlob( + "Simple is better than complex. " "Complex is better than complicated." 
+ ) first_word, first_tag = blob.pos_tags[0] - assert_true(isinstance(first_word, tb.Word)) - assert_equal(first_word.pos_tag, first_tag) + assert isinstance(first_word, tb.Word) + assert first_word.pos_tag == first_tag def test_tokenizer_defaults_to_word_tokenizer(self): - assert_true(isinstance(self.blob.tokenizer, WordTokenizer)) + assert isinstance(self.blob.tokenizer, WordTokenizer) def test_tokens_property(self): - assert_true(self.blob.tokens, - tb.WordList(WordTokenizer().tokenize(self.text))) + assert self.blob.tokens, tb.WordList(WordTokenizer().tokenize(self.text)) def test_can_use_an_different_tokenizer(self): tokenizer = nltk.tokenize.TabTokenizer() blob = tb.TextBlob("This is\ttext.", tokenizer=tokenizer) - assert_equal(blob.tokens, tb.WordList(["This is", "text."])) + assert blob.tokens == tb.WordList(["This is", "text."]) def test_tokenize_method(self): tokenizer = nltk.tokenize.TabTokenizer() blob = tb.TextBlob("This is\ttext.") # If called without arguments, should default to WordTokenizer - assert_equal(blob.tokenize(), tb.WordList(["This", "is", "text", "."])) + assert blob.tokenize() == tb.WordList(["This", "is", "text", "."]) # Pass in the TabTokenizer - assert_equal(blob.tokenize(tokenizer), tb.WordList(["This is", "text."])) + assert blob.tokenize(tokenizer) == tb.WordList(["This is", "text."]) def test_tags_uses_custom_tokenizer(self): tokenizer = nltk.tokenize.regexp.WordPunctTokenizer() blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer) - assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'VBP'), ( - u'3', u'CD'), (u'88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')]) + assert blob.tags == [ + ("Good", "JJ"), + ("muffins", "NNS"), + ("cost", "VBP"), + ("3", "CD"), + ("88", "CD"), + ("in", "IN"), + ("New", "NNP"), + ("York", "NNP"), + ] def test_tags_with_custom_tokenizer_and_tagger(self): tokenizer = nltk.tokenize.regexp.WordPunctTokenizer() tagger = tb.taggers.PatternTagger() - blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer, pos_tagger=tagger) + blob = tb.TextBlob( + "Good muffins cost $3.88\nin New York.", + tokenizer=tokenizer, + pos_tagger=tagger, + ) # PatterTagger takes raw text (not tokens), and handles tokenization itself. - assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'NN'), - (u'3.88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')]) - - @mock.patch('textblob.translate.Translator.translate') - def test_translate(self, mock_translate): - mock_translate.return_value = 'Esta es una frase.' - blob = tb.TextBlob("This is a sentence.") - translated = blob.translate(to="es") - assert_true(isinstance(translated, tb.TextBlob)) - assert_equal(translated, "Esta es una frase.") - mock_translate.return_value = 'This is a sentence.' 
- es_blob = tb.TextBlob("Esta es una frase.") - to_en = es_blob.translate(from_lang="es", to="en") - assert_equal(to_en, "This is a sentence.") - - @mock.patch('textblob.translate.Translator.detect') - def test_detect(self, mock_detect): - mock_detect.return_value = 'es' - es_blob = tb.TextBlob("Hola") - assert_equal(es_blob.detect_language(), "es") - assert_true(mock_detect.called_once_with('Hola')) + assert blob.tags == [ + ("Good", "JJ"), + ("muffins", "NNS"), + ("cost", "NN"), + ("3.88", "CD"), + ("in", "IN"), + ("New", "NNP"), + ("York", "NNP"), + ] def test_correct(self): blob = tb.TextBlob("I havv bad speling.") - assert_true(isinstance(blob.correct(), tb.TextBlob)) - assert_equal(blob.correct(), tb.TextBlob("I have bad spelling.")) + assert isinstance(blob.correct(), tb.TextBlob) + assert blob.correct() == tb.TextBlob("I have bad spelling.") blob2 = tb.TextBlob("I am so exciited!!!") - assert_equal(blob2.correct(), "I am so excited!!!") + assert blob2.correct() == "I am so excited!!!" blob3 = tb.TextBlob("The meaning of life is 42.0.") - assert_equal(blob3.correct(), "The meaning of life is 42.0.") + assert blob3.correct() == "The meaning of life is 42.0." blob4 = tb.TextBlob("?") - assert_equal(blob4.correct(), "?") + assert blob4.correct() == "?" blob5 = tb.TextBlob("I can't spel") - assert_equal(blob5.correct(), "I can't spell") + assert blob5.correct() == "I can't spell" blob6 = tb.TextBlob("I cann't \nspel") - assert_equal(blob6.correct(), "I can't \nspell") + assert blob6.correct() == "I can't \nspell" # From a user-submitted bug - text = "Before you embark on any of this journey, write a quick " + \ - "high-level test that demonstrates the slowness. " + \ - "You may need to introduce some minimum set of data to " + \ - "reproduce a significant enough slowness." + text = ( + "Before you embark on any of this journey, write a quick " + + "high-level test that demonstrates the slowness. " + + "You may need to introduce some minimum set of data to " + + "reproduce a significant enough slowness." + ) blob5 = tb.TextBlob(text) - assert_equal(blob5.correct(), text) - text = "Word list! :\n" + \ - "\t* spelling\n" + \ - "\t* well" + assert blob5.correct() == text + text = "Word list! :\n" + "\t* spelling\n" + "\t* well" blob6 = tb.TextBlob(text) - assert_equal(blob6.correct(), text) + assert blob6.correct() == text def test_parse(self): blob = tb.TextBlob("And now for something completely different.") - assert_equal(blob.parse(), PatternParser().parse(blob.string)) + assert blob.parse() == PatternParser().parse(blob.string) def test_passing_bad_init_params(self): tagger = PatternTagger() - assert_raises(ValueError, - lambda: tb.TextBlob("blah", parser=tagger)) - assert_raises(ValueError, - lambda: tb.TextBlob("blah", np_extractor=tagger)) - assert_raises(ValueError, - lambda: tb.TextBlob("blah", tokenizer=tagger)) - assert_raises(ValueError, - lambda: tb.TextBlob("blah", analyzer=tagger)) - analyzer = PatternAnalyzer - assert_raises(ValueError, - lambda: tb.TextBlob("blah", pos_tagger=analyzer)) + with pytest.raises(ValueError): + tb.TextBlob("blah", parser=tagger) + with pytest.raises(ValueError): + tb.TextBlob("blah", np_extractor=tagger) + with pytest.raises(ValueError): + tb.TextBlob("blah", tokenizer=tagger) + with pytest.raises(ValueError): + tb.TextBlob("blah", analyzer=tagger) + with pytest.raises(ValueError): + tb.TextBlob("blah", pos_tagger=PatternAnalyzer) def test_classify(self): - blob = tb.TextBlob("This is an amazing library. 
What an awesome classifier!", - classifier=classifier) - assert_equal(blob.classify(), 'pos') + blob = tb.TextBlob( + "This is an amazing library. What an awesome classifier!", + classifier=classifier, + ) + assert blob.classify() == "pos" for s in blob.sentences: - assert_equal(s.classify(), 'pos') + assert s.classify() == "pos" def test_classify_without_classifier(self): blob = tb.TextBlob("This isn't gonna be good") - assert_raises(NameError, - lambda: blob.classify()) + with pytest.raises(NameError): + blob.classify() def test_word_string_type_after_pos_tags_is_str(self): - text = 'John is a cat' + text = "John is a cat" blob = tb.TextBlob(text) - for word, part_of_speech in blob.pos_tags: - assert type(word.string) is unicode + for word, _ in blob.pos_tags: + assert type(word.string) is str class WordTest(TestCase): - def setUp(self): - self.cat = tb.Word('cat') - self.cats = tb.Word('cats') + self.cat = tb.Word("cat") + self.cats = tb.Word("cats") def test_init(self): tb.Word("cat") - assert_true(isinstance(self.cat, tb.Word)) - word = tb.Word('cat', 'NN') - assert_equal(word.pos_tag, 'NN') + assert isinstance(self.cat, tb.Word) + word = tb.Word("cat", "NN") + assert word.pos_tag == "NN" def test_singularize(self): singular = self.cats.singularize() - assert_equal(singular, 'cat') - assert_equal(self.cat.singularize(), 'cat') - assert_true(isinstance(self.cat.singularize(), tb.Word)) + assert singular == "cat" + assert self.cat.singularize() == "cat" + assert isinstance(self.cat.singularize(), tb.Word) def test_pluralize(self): plural = self.cat.pluralize() - assert_equal(self.cat.pluralize(), 'cats') - assert_true(isinstance(plural, tb.Word)) + assert self.cat.pluralize() == "cats" + assert isinstance(plural, tb.Word) def test_repr(self): - assert_equal(repr(self.cat), repr("cat")) + assert repr(self.cat) == repr("cat") def test_str(self): - assert_equal(str(self.cat), 'cat') + assert str(self.cat) == "cat" def test_has_str_methods(self): - assert_equal(self.cat.upper(), "CAT") - assert_equal(self.cat.lower(), "cat") - assert_equal(self.cat[0:2], 'ca') - - @mock.patch('textblob.translate.Translator.translate') - def test_translate(self, mock_translate): - mock_translate.return_value = 'gato' - assert_equal(tb.Word("cat").translate(to="es"), "gato") - - @mock.patch('textblob.translate.Translator.translate') - def test_translate_without_from_lang(self, mock_translate): - mock_translate.return_value = 'hi' - assert_equal(tb.Word('hola').translate(), 'hi') - - @mock.patch('textblob.translate.Translator.detect') - def test_detect_language(self, mock_detect): - mock_detect.return_value = 'fr' - assert_equal(tb.Word("bonjour").detect_language(), 'fr') + assert self.cat.upper() == "CAT" + assert self.cat.lower() == "cat" + assert self.cat[0:2] == "ca" def test_spellcheck(self): blob = tb.Word("speling") suggestions = blob.spellcheck() - assert_equal(suggestions[0][0], "spelling") + assert suggestions[0][0] == "spelling" def test_spellcheck_special_cases(self): # Punctuation - assert_equal(tb.Word("!").spellcheck(), [("!", 1.0)]) + assert tb.Word("!").spellcheck() == [("!", 1.0)] # Numbers - assert_equal(tb.Word("42").spellcheck(), [("42", 1.0)]) - assert_equal(tb.Word("12.34").spellcheck(), [("12.34", 1.0)]) + assert tb.Word("42").spellcheck() == [("42", 1.0)] + assert tb.Word("12.34").spellcheck() == [("12.34", 1.0)] # One-letter words - assert_equal(tb.Word("I").spellcheck(), [("I", 1.0)]) - assert_equal(tb.Word("A").spellcheck(), [("A", 1.0)]) - assert_equal(tb.Word("a").spellcheck(), 
[("a", 1.0)]) + assert tb.Word("I").spellcheck() == [("I", 1.0)] + assert tb.Word("A").spellcheck() == [("A", 1.0)] + assert tb.Word("a").spellcheck() == [("a", 1.0)] def test_correct(self): - w = tb.Word('speling') + w = tb.Word("speling") correct = w.correct() - assert_equal(correct, tb.Word('spelling')) - assert_true(isinstance(correct, tb.Word)) + assert correct == tb.Word("spelling") + assert isinstance(correct, tb.Word) - @attr('slow') + @pytest.mark.slow def test_lemmatize(self): w = tb.Word("cars") - assert_equal(w.lemmatize(), "car") + assert w.lemmatize() == "car" w = tb.Word("wolves") - assert_equal(w.lemmatize(), "wolf") + assert w.lemmatize() == "wolf" w = tb.Word("went") - assert_equal(w.lemmatize("v"), "go") # wordnet tagset - assert_equal(w.lemmatize("VBD"), "go") # penn treebank tagset + assert w.lemmatize("v") == "go" # wordnet tagset + assert w.lemmatize("VBD") == "go" # penn treebank tagset def test_lemma(self): w = tb.Word("wolves") - assert_equal(w.lemma, "wolf") - w = tb.Word("went", "VBD"); - assert_equal(w.lemma, "go") + assert w.lemma == "wolf" + w = tb.Word("went", "VBD") + assert w.lemma == "go" - def test_stem(self): #only PorterStemmer tested + def test_stem(self): # only PorterStemmer tested w = tb.Word("cars") - assert_equal(w.stem(), "car") + assert w.stem() == "car" w = tb.Word("wolves") - assert_equal(w.stem(), "wolv") + assert w.stem() == "wolv" w = tb.Word("went") - assert_equal(w.stem(), "went") + assert w.stem() == "went" def test_synsets(self): w = tb.Word("car") - assert_true(isinstance(w.synsets, (list, tuple))) - assert_true(isinstance(w.synsets[0], Synset)) + assert isinstance(w.synsets, (list, tuple)) + assert isinstance(w.synsets[0], Synset) def test_synsets_with_pos_argument(self): w = tb.Word("work") noun_syns = w.get_synsets(pos=wn.NOUN) for synset in noun_syns: - assert_equal(synset.pos(), wn.NOUN) + assert synset.pos() == wn.NOUN def test_definitions(self): w = tb.Word("octopus") for definition in w.definitions: - print(type(definition)) - assert_true(isinstance(definition, basestring)) + assert isinstance(definition, str) def test_define(self): w = tb.Word("hack") synsets = w.get_synsets(wn.NOUN) definitions = w.define(wn.NOUN) - assert_equal(len(synsets), len(definitions)) + assert len(synsets) == len(definitions) class TestWordnetInterface(TestCase): - def setUp(self): pass def test_synset(self): syn = wn.Synset("dog.n.01") word = tb.Word("dog") - assert_equal(word.synsets[0], syn) + assert word.synsets[0] == syn def test_lemma(self): - lemma = wn.Lemma('eat.v.01.eat') + lemma = wn.Lemma("eat.v.01.eat") word = tb.Word("eat") - assert_equal(word.synsets[0].lemmas()[0], lemma) + assert word.synsets[0].lemmas()[0] == lemma class BlobberTest(TestCase): - def setUp(self): self.blobber = tb.Blobber() # The default blobber def test_creates_blobs(self): blob1 = self.blobber("this is one blob") - assert_true(isinstance(blob1, tb.TextBlob)) + assert isinstance(blob1, tb.TextBlob) blob2 = self.blobber("another blob") - assert_equal(blob1.pos_tagger, blob2.pos_tagger) + assert blob1.pos_tagger == blob2.pos_tagger def test_default_tagger(self): blob = self.blobber("Some text") - assert_true(isinstance(blob.pos_tagger, NLTKTagger)) + assert isinstance(blob.pos_tagger, NLTKTagger) def test_default_np_extractor(self): blob = self.blobber("Some text") - assert_true(isinstance(blob.np_extractor, FastNPExtractor)) + assert isinstance(blob.np_extractor, FastNPExtractor) def test_default_tokenizer(self): blob = self.blobber("Some text") - 
assert_true(isinstance(blob.tokenizer, WordTokenizer)) + assert isinstance(blob.tokenizer, WordTokenizer) def test_str_and_repr(self): - expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=NLTKTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)" - assert_equal(repr(self.blobber), expected) - assert_equal(str(self.blobber), repr(self.blobber)) + expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=NLTKTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)" # noqa: E501 + assert repr(self.blobber) == expected + assert str(self.blobber) == repr(self.blobber) def test_overrides(self): - b = tb.Blobber(tokenizer=SentenceTokenizer(), - np_extractor=ConllExtractor()) + b = tb.Blobber(tokenizer=SentenceTokenizer(), np_extractor=ConllExtractor()) blob = b("How now? Brown cow?") - assert_true(isinstance(blob.tokenizer, SentenceTokenizer)) - assert_equal(blob.tokens, tb.WordList(["How now?", "Brown cow?"])) + assert isinstance(blob.tokenizer, SentenceTokenizer) + assert blob.tokens == tb.WordList(["How now?", "Brown cow?"]) blob2 = b("Another blob") # blobs have the same tokenizer - assert_true(blob.tokenizer is blob2.tokenizer) + assert blob.tokenizer is blob2.tokenizer # but aren't the same object - assert_not_equal(blob, blob2) + assert blob != blob2 def test_override_analyzer(self): b = tb.Blobber(analyzer=NaiveBayesAnalyzer()) blob = b("How now?") blob2 = b("Brown cow") - assert_true(isinstance(blob.analyzer, NaiveBayesAnalyzer)) - assert_true(blob.analyzer is blob2.analyzer) + assert isinstance(blob.analyzer, NaiveBayesAnalyzer) + assert blob.analyzer is blob2.analyzer def test_overrider_classifier(self): b = tb.Blobber(classifier=classifier) blob = b("I am so amazing") - assert_equal(blob.classify(), 'pos') + assert blob.classify() == "pos" + def is_blob(obj): return isinstance(obj, tb.TextBlob) - -if __name__ == '__main__': - main() diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py index 9db1c6ba..a0bc9109 100644 --- a/tests/test_classifiers.py +++ b/tests/test_classifiers.py @@ -1,99 +1,106 @@ -# -*- coding: utf-8 -*- import os import unittest +from unittest import mock -import mock -from nose.tools import * # PEP8 asserts -from nose.plugins.attrib import attr import nltk +import pytest -from textblob.tokenizers import WordTokenizer -from textblob.classifiers import (NaiveBayesClassifier, DecisionTreeClassifier, - basic_extractor, contains_extractor, NLTKClassifier, - PositiveNaiveBayesClassifier, _get_words_from_dataset, - MaxEntClassifier) from textblob import formats -from textblob.compat import unicode +from textblob.classifiers import ( + DecisionTreeClassifier, + MaxEntClassifier, + NaiveBayesClassifier, + NLTKClassifier, + PositiveNaiveBayesClassifier, + _get_words_from_dataset, + basic_extractor, + contains_extractor, +) from textblob.exceptions import FormatError +from textblob.tokenizers import WordTokenizer HERE = os.path.abspath(os.path.dirname(__file__)) -CSV_FILE = os.path.join(HERE, 'data.csv') +CSV_FILE = os.path.join(HERE, "data.csv") JSON_FILE = os.path.join(HERE, "data.json") TSV_FILE = os.path.join(HERE, "data.tsv") train_set = [ - ('I love this car', 'positive'), - ('This view is amazing', 'positive'), - ('I feel great this morning', 'positive'), - ('I am so excited about the concert', 'positive'), - ('He is my best friend', 'positive'), - ('I do not like this car', 'negative'), - ('This view is horrible', 'negative'), - ('I feel 
tired this morning', 'negative'), - ('I am not looking forward to the concert', 'negative'), - ('He is my enemy', 'negative') + ("I love this car", "positive"), + ("This view is amazing", "positive"), + ("I feel great this morning", "positive"), + ("I am so excited about the concert", "positive"), + ("He is my best friend", "positive"), + ("I do not like this car", "negative"), + ("This view is horrible", "negative"), + ("I feel tired this morning", "negative"), + ("I am not looking forward to the concert", "negative"), + ("He is my enemy", "negative"), +] + +test_set = [ + ("I feel happy this morning", "positive"), + ("Larry is my friend.", "positive"), + ("I do not like that man.", "negative"), + ("My house is not great.", "negative"), + ("Your song is annoying.", "negative"), ] -test_set = [('I feel happy this morning', 'positive'), - ('Larry is my friend.', 'positive'), - ('I do not like that man.', 'negative'), - ('My house is not great.', 'negative'), - ('Your song is annoying.', 'negative')] class BadNLTKClassifier(NLTKClassifier): - '''An NLTK classifier without ``nltk_class`` defined. Oops!''' + """An NLTK classifier without ``nltk_class`` defined. Oops!""" + pass -class TestNLTKClassifier(unittest.TestCase): +class TestNLTKClassifier(unittest.TestCase): def setUp(self): self.bad_classifier = BadNLTKClassifier(train_set) def test_raises_value_error_without_nltk_class(self): - assert_raises(ValueError, - lambda: self.bad_classifier.classifier) + with pytest.raises(ValueError): + self.bad_classifier.classifier # noqa: B018 - assert_raises(ValueError, - lambda: self.bad_classifier.train(train_set)) + with pytest.raises(ValueError): + self.bad_classifier.train(train_set) - assert_raises(ValueError, - lambda: self.bad_classifier.update([("This is no good.", 'negative')])) + with pytest.raises(ValueError): + self.bad_classifier.update([("This is no good.", "negative")]) class TestNaiveBayesClassifier(unittest.TestCase): - def setUp(self): self.classifier = NaiveBayesClassifier(train_set) def test_default_extractor(self): text = "I feel happy this morning." 
- assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set)) + assert self.classifier.extract_features(text) == basic_extractor( + text, train_set + ) def test_classify(self): res = self.classifier.classify("I feel happy this morning") - assert_equal(res, 'positive') - assert_equal(len(self.classifier.train_set), len(train_set)) + assert res == "positive" + assert len(self.classifier.train_set) == len(train_set) def test_classify_a_list_of_words(self): res = self.classifier.classify(["I", "feel", "happy", "this", "morning"]) - assert_equal(res, "positive") + assert res == "positive" def test_train_from_lists_of_words(self): # classifier can be trained on lists of words instead of strings train = [(doc.split(), label) for doc, label in train_set] classifier = NaiveBayesClassifier(train) - assert_equal(classifier.accuracy(test_set), - self.classifier.accuracy(test_set)) + assert classifier.accuracy(test_set) == self.classifier.accuracy(test_set) def test_prob_classify(self): res = self.classifier.prob_classify("I feel happy this morning") - assert_equal(res.max(), "positive") - assert_true(res.prob("positive") > res.prob("negative")) + assert res.max() == "positive" + assert res.prob("positive") > res.prob("negative") def test_accuracy(self): acc = self.classifier.accuracy(test_set) - assert_true(isinstance(acc, float)) + assert isinstance(acc, float) def test_update(self): res1 = self.classifier.prob_classify("lorem ipsum") @@ -101,57 +108,57 @@ def test_update(self): self.classifier.update([("lorem ipsum", "positive")]) new_length = len(self.classifier.train_set) res2 = self.classifier.prob_classify("lorem ipsum") - assert_true(res2.prob("positive") > res1.prob("positive")) - assert_equal(original_length + 1, new_length) + assert res2.prob("positive") > res1.prob("positive") + assert original_length + 1 == new_length def test_labels(self): labels = self.classifier.labels() - assert_true("positive" in labels) - assert_true("negative" in labels) + assert "positive" in labels + assert "negative" in labels def test_show_informative_features(self): - feats = self.classifier.show_informative_features() + self.classifier.show_informative_features() def test_informative_features(self): feats = self.classifier.informative_features(3) - assert_true(isinstance(feats, list)) - assert_true(isinstance(feats[0], tuple)) + assert isinstance(feats, list) + assert isinstance(feats[0], tuple) def test_custom_feature_extractor(self): cl = NaiveBayesClassifier(train_set, custom_extractor) cl.classify("Yay! 
I'm so happy it works.") - assert_equal(cl.train_features[0][1], 'positive') + assert cl.train_features[0][1] == "positive" def test_init_with_csv_file(self): with open(CSV_FILE) as fp: cl = NaiveBayesClassifier(fp, format="csv") - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_csv_file_without_format_specifier(self): with open(CSV_FILE) as fp: cl = NaiveBayesClassifier(fp) - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_json_file(self): with open(JSON_FILE) as fp: cl = NaiveBayesClassifier(fp, format="json") - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_json_file_without_format_specifier(self): with open(JSON_FILE) as fp: cl = NaiveBayesClassifier(fp) - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_custom_format(self): - redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')] + redis_train = [("I like turtles", "pos"), ("I hate turtles", "neg")] class MockRedisFormat(formats.BaseFormat): def __init__(self, client, port): @@ -165,140 +172,148 @@ def detect(cls, stream): def to_iterable(self): return redis_train - formats.register('redis', MockRedisFormat) + formats.register("redis", MockRedisFormat) mock_redis = mock.Mock() - cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234) - assert_equal(cl.train_set, redis_train) + cl = NaiveBayesClassifier(mock_redis, format="redis", port=1234) + assert cl.train_set == redis_train def test_data_with_no_available_format(self): mock_fp = mock.Mock() - mock_fp.read.return_value = '' + mock_fp.read.return_value = "" - assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp)) + with pytest.raises(FormatError): + NaiveBayesClassifier(mock_fp) def test_accuracy_on_a_csv_file(self): with open(CSV_FILE) as fp: a = self.classifier.accuracy(fp) - assert_equal(type(a), float) + assert type(a) == float def test_accuracy_on_json_file(self): with open(CSV_FILE) as fp: a = self.classifier.accuracy(fp) - assert_equal(type(a), float) + assert type(a) == float def test_init_with_tsv_file(self): with open(TSV_FILE) as fp: cl = NaiveBayesClassifier(fp) - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_bad_format_specifier(self): - assert_raises(ValueError, - lambda: NaiveBayesClassifier(CSV_FILE, format='unknown')) + with pytest.raises(ValueError): + NaiveBayesClassifier(CSV_FILE, format="unknown") def test_repr(self): - assert_equal(repr(self.classifier), - "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set))) + assert (
repr(self.classifier) + == f"<NaiveBayesClassifier trained on {len(train_set)} instances>" + ) class TestDecisionTreeClassifier(unittest.TestCase): - def setUp(self): self.classifier = DecisionTreeClassifier(train_set) def test_classify(self): res = self.classifier.classify("I feel happy this morning") - assert_equal(res, 'positive') - assert_equal(len(self.classifier.train_set), len(train_set)) + assert res == "positive" + assert len(self.classifier.train_set) == len(train_set) def test_accuracy(self): acc = self.classifier.accuracy(test_set) - assert_true(isinstance(acc, float)) + assert isinstance(acc, float) def test_update(self): original_length = len(self.classifier.train_set) self.classifier.update([("lorem ipsum", "positive")]) new_length = len(self.classifier.train_set) - assert_equal(original_length + 1, new_length) + assert original_length + 1 == new_length def test_custom_feature_extractor(self): cl = DecisionTreeClassifier(train_set, custom_extractor) cl.classify("Yay! I'm so happy it works.") - assert_equal(cl.train_features[0][1], 'positive') + assert cl.train_features[0][1] == "positive" def test_pseudocode(self): code = self.classifier.pseudocode() - assert_true("if" in code) + assert "if" in code def test_pretty_format(self): pp = self.classifier.pprint(width=60) pf = self.classifier.pretty_format(width=60) - assert_true(isinstance(pp, unicode)) - assert_equal(pp, pf) + assert isinstance(pp, str) + assert pp == pf def test_repr(self): - assert_equal(repr(self.classifier), - "<DecisionTreeClassifier trained on {0} instances>".format(len(train_set))) + assert ( + repr(self.classifier) + == f"<DecisionTreeClassifier trained on {len(train_set)} instances>" + ) -@attr('requires_numpy') -@attr('slow') -class TestMaxEntClassifier(unittest.TestCase): +@pytest.mark.numpy +@pytest.mark.slow +class TestMaxEntClassifier(unittest.TestCase): def setUp(self): self.classifier = MaxEntClassifier(train_set) def test_classify(self): res = self.classifier.classify("I feel happy this morning") - assert_equal(res, 'positive') - assert_equal(len(self.classifier.train_set), len(train_set)) + assert res == "positive" + assert len(self.classifier.train_set) == len(train_set) def test_prob_classify(self): res = self.classifier.prob_classify("I feel happy this morning") - assert_equal(res.max(), 'positive') - assert_true(res.prob("positive") > res.prob("negative")) - + assert res.max() == "positive" + assert res.prob("positive") > res.prob("negative") class TestPositiveNaiveBayesClassifier(unittest.TestCase): - def setUp(self): - sports_sentences = ['The team dominated the game', - 'They lost the ball', - 'The game was intense', - 'The goalkeeper catched the ball', - 'The other team controlled the ball' - 'The ball went off the court', - 'They had the ball for the whole game'] - - various_sentences = ['The President did not comment', - 'I lost the keys', - 'The team won the game', - 'Sara has two kids', - 'The show is over', - 'The cat ate the mouse.'] - - self.classifier = PositiveNaiveBayesClassifier(positive_set=sports_sentences, - unlabeled_set=various_sentences) + sports_sentences = [ + "The team dominated the game", + "They lost the ball", + "The game was intense", + "The goalkeeper catched the ball", + "The other team controlled the ball" "The ball went off the court", + "They had the ball for the whole game", + ] - def test_classifier(self): - assert_true(isinstance(self.classifier.classifier, - nltk.classify.PositiveNaiveBayesClassifier)) + various_sentences = [ + "The President did not comment", + "I lost the keys", + "The team won the game", + "Sara has two kids", + "The show is over", + "The cat ate the mouse.", + ] + + self.classifier =
PositiveNaiveBayesClassifier( + positive_set=sports_sentences, unlabeled_set=various_sentences + ) + def test_classifier(self): + assert isinstance( + self.classifier.classifier, nltk.classify.PositiveNaiveBayesClassifier + ) def test_classify(self): - assert_true(self.classifier.classify("My team lost the game.")) - assert_false(self.classifier.classify("The cat is on the table.")) + assert self.classifier.classify("My team lost the game.") + assert not self.classifier.classify("The cat is on the table.") def test_update(self): orig_pos_length = len(self.classifier.positive_set) orig_unlabeled_length = len(self.classifier.unlabeled_set) - self.classifier.update(new_positive_data=['He threw the ball to the base.'], - new_unlabeled_data=["I passed a tree today."]) + self.classifier.update( + new_positive_data=["He threw the ball to the base."], + new_unlabeled_data=["I passed a tree today."], + ) new_pos_length = len(self.classifier.positive_set) new_unlabeled_length = len(self.classifier.unlabeled_set) - assert_equal(new_pos_length, orig_pos_length + 1) - assert_equal(new_unlabeled_length, orig_unlabeled_length + 1) + assert new_pos_length == orig_pos_length + 1 + assert new_unlabeled_length == orig_unlabeled_length + 1 def test_accuracy(self): test_set = [ @@ -306,64 +321,70 @@ def test_accuracy(self): ("The ball was in the court.", True), ("We should have won the game.", True), ("And now for something completely different", False), - ("I can't believe it's not butter.", False) + ("I can't believe it's not butter.", False), ] accuracy = self.classifier.accuracy(test_set) - assert_true(isinstance(accuracy, float)) + assert isinstance(accuracy, float) def test_repr(self): - assert_equal(repr(self.classifier), - "<PositiveNaiveBayesClassifier trained on {0} labeled and {1} unlabeled instances>" - .format(len(self.classifier.positive_set), - len(self.classifier.unlabeled_set)) - ) + assert ( + repr(self.classifier) + == "<PositiveNaiveBayesClassifier trained on {} labeled and {} unlabeled instances>".format(  # noqa: E501 + len(self.classifier.positive_set), len(self.classifier.unlabeled_set) + ) + ) def test_basic_extractor(): text = "I feel happy this morning."
feats = basic_extractor(text, train_set) - assert_true(feats["contains(feel)"]) - assert_true(feats['contains(morning)']) - assert_false(feats["contains(amazing)"]) + assert feats["contains(feel)"] + assert feats["contains(morning)"] + assert not feats["contains(amazing)"] + def test_basic_extractor_with_list(): text = "I feel happy this morning.".split() feats = basic_extractor(text, train_set) - assert_true(feats["contains(feel)"]) - assert_true(feats['contains(morning)']) - assert_false(feats["contains(amazing)"]) + assert feats["contains(feel)"] + assert feats["contains(morning)"] + assert not feats["contains(amazing)"] + def test_contains_extractor_with_string(): text = "Simple is better than complex" features = contains_extractor(text) - assert_true(features["contains(Simple)"]) - assert_false(features.get('contains(simple)', False)) - assert_true(features['contains(complex)']) - assert_false(features.get("contains(derp)", False)) + assert features["contains(Simple)"] + assert not features.get("contains(simple)", False) + assert features["contains(complex)"] + assert not features.get("contains(derp)", False) + def test_contains_extractor_with_list(): text = ["Simple", "is", "better", "than", "complex"] features = contains_extractor(text) - assert_true(features['contains(Simple)']) - assert_false(features.get("contains(simple)", False)) - assert_true(features['contains(complex)']) - assert_false(features.get("contains(derp)", False)) + assert features["contains(Simple)"] + assert not features.get("contains(simple)", False) + assert features["contains(complex)"] + assert not features.get("contains(derp)", False) + def custom_extractor(document): feats = {} tokens = document.split() for tok in tokens: - feat_name = "last_letter({0})".format(tok[-1]) + feat_name = f"last_letter({tok[-1]})" feats[feat_name] = True return feats + def test_get_words_from_dataset(): tok = WordTokenizer() all_words = [] for words, _ in train_set: all_words.extend(tok.itokenize(words, include_punc=False)) - assert_equal(_get_words_from_dataset(train_set), set(all_words)) + assert _get_words_from_dataset(train_set) == set(all_words) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_decorators.py b/tests/test_decorators.py index e36af252..cd974348 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -1,14 +1,12 @@ -# -*- coding: utf-8 -*- import unittest -from nose.plugins.attrib import attr -from nose.tools import * # PEP8 asserts + +import pytest from textblob.decorators import requires_nltk_corpus from textblob.exceptions import MissingCorpusError -class Tokenizer(object): - +class Tokenizer: @requires_nltk_corpus def tag(self, text): raise LookupError @@ -16,7 +14,9 @@ def tag(self, text): def test_decorator_raises_missing_corpus_exception(): t = Tokenizer() - assert_raises(MissingCorpusError, lambda: t.tag('hello world')) + with pytest.raises(MissingCorpusError): + t.tag("hello world") + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_formats.py b/tests/test_formats.py index 957829e2..4130232f 100644 --- a/tests/test_formats.py +++ b/tests/test_formats.py @@ -1,107 +1,103 @@ -# -*- coding: utf-8 -*- import os import unittest -from nose.tools import * # noqa (PEP8 asserts) from textblob import formats -from textblob.compat import unicode HERE = os.path.abspath(os.path.dirname(__file__)) -CSV_FILE = os.path.join(HERE, 'data.csv') +CSV_FILE = os.path.join(HERE, "data.csv") JSON_FILE = 
os.path.join(HERE, "data.json") TSV_FILE = os.path.join(HERE, "data.tsv") -class TestFormats(unittest.TestCase): +class TestFormats(unittest.TestCase): def setUp(self): pass def test_detect_csv(self): with open(CSV_FILE) as fp: format = formats.detect(fp) - assert_equal(format, formats.CSV) + assert format == formats.CSV def test_detect_json(self): with open(JSON_FILE) as fp: format = formats.detect(fp) - assert_equal(format, formats.JSON) + assert format == formats.JSON def test_available(self): registry = formats.get_registry() - assert_true('csv' in registry.keys()) - assert_true('json' in registry.keys()) - assert_true('tsv' in registry.keys()) + assert "csv" in registry.keys() + assert "json" in registry.keys() + assert "tsv" in registry.keys() -class TestDelimitedFormat(unittest.TestCase): +class TestDelimitedFormat(unittest.TestCase): def test_delimiter_defaults_to_comma(self): - assert_equal(formats.DelimitedFormat.delimiter, ",") + assert formats.DelimitedFormat.delimiter == "," def test_detect(self): - with open(CSV_FILE, 'r') as fp: + with open(CSV_FILE) as fp: stream = fp.read() - assert_true(formats.DelimitedFormat.detect(stream)) - with open(JSON_FILE, 'r') as fp: + assert formats.DelimitedFormat.detect(stream) + with open(JSON_FILE) as fp: stream = fp.read() - assert_false(formats.DelimitedFormat.detect(stream)) + assert not formats.DelimitedFormat.detect(stream) -class TestCSV(unittest.TestCase): +class TestCSV(unittest.TestCase): def test_read_from_filename(self): with open(CSV_FILE) as fp: - data = formats.CSV(fp) + formats.CSV(fp) def test_detect(self): - with open(CSV_FILE, 'r') as fp: + with open(CSV_FILE) as fp: stream = fp.read() - assert_true(formats.CSV.detect(stream)) - with open(JSON_FILE, 'r') as fp: + assert formats.CSV.detect(stream) + with open(JSON_FILE) as fp: stream = fp.read() - assert_false(formats.CSV.detect(stream)) + assert not formats.CSV.detect(stream) -class TestTSV(unittest.TestCase): +class TestTSV(unittest.TestCase): def test_read_from_file_object(self): with open(TSV_FILE) as fp: - data = formats.TSV(fp) + formats.TSV(fp) def test_detect(self): - with open(TSV_FILE, 'r') as fp: + with open(TSV_FILE) as fp: stream = fp.read() - assert_true(formats.TSV.detect(stream)) + assert formats.TSV.detect(stream) - with open(CSV_FILE, 'r') as fp: + with open(CSV_FILE) as fp: stream = fp.read() - assert_false(formats.TSV.detect(stream)) + assert not formats.TSV.detect(stream) -class TestJSON(unittest.TestCase): +class TestJSON(unittest.TestCase): def test_read_from_file_object(self): with open(JSON_FILE) as fp: formats.JSON(fp) def test_detect(self): - with open(JSON_FILE, 'r') as fp: + with open(JSON_FILE) as fp: stream = fp.read() - assert_true(formats.JSON.detect(stream)) - with open(CSV_FILE, 'r') as fp: + assert formats.JSON.detect(stream) + with open(CSV_FILE) as fp: stream = fp.read() - assert_false(formats.JSON.detect(stream)) + assert not formats.JSON.detect(stream) def test_to_iterable(self): with open(JSON_FILE) as fp: d = formats.JSON(fp) data = d.to_iterable() first = data[0] - text, label = first[0], first[1] - assert_true(isinstance(text, unicode)) + text, _label = first[0], first[1] + assert isinstance(text, str) + class CustomFormat(formats.BaseFormat): def to_iterable(): - return [ - ('I like turtles', 'pos'), - ('I hate turtles', 'neg') - ] + return [("I like turtles", "pos"), ("I hate turtles", "neg")] + @classmethod def detect(cls, stream): return True @@ -113,13 +109,13 @@ def setUp(self): def test_register(self): registry = 
formats.get_registry() - assert_false(CustomFormat in registry.values()) + assert CustomFormat not in registry.values() - formats.register('trt', CustomFormat) + formats.register("trt", CustomFormat) - assert_true(CustomFormat in registry.values()) - assert_true('trt' in registry.keys()) + assert CustomFormat in registry.values() + assert "trt" in registry.keys() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_inflect.py b/tests/test_inflect.py index 6631a643..0a2eecc7 100644 --- a/tests/test_inflect.py +++ b/tests/test_inflect.py @@ -1,41 +1,36 @@ -from nose.tools import assert_equals, assert_true from unittest import TestCase - from textblob.en.inflect import ( plural_categories, + pluralize, singular_ie, singular_irregular, - singular_uncountable, - singular_uninflected, singularize, - pluralize ) class InflectTestCase(TestCase): - def s_singular_pluralize_test(self): - assert_equals(pluralize('lens'), 'lenses') + assert pluralize("lens") == "lenses" def s_singular_singularize_test(self): - assert_equals(singularize('lenses'), 'lens') + assert singularize("lenses") == "lens" def diagnoses_singularize_test(self): - assert_equals(singularize('diagnoses'), 'diagnosis') + assert singularize("diagnoses") == "diagnosis" def bus_pluralize_test(self): - assert_equals(pluralize('bus'), 'buses') + assert pluralize("bus") == "buses" def test_all_singular_s(self): - for w in plural_categories['s-singular']: - assert_equals(singularize(pluralize(w)), w) + for w in plural_categories["s-singular"]: + assert singularize(pluralize(w)) == w def test_all_singular_ie(self): for w in singular_ie: - assert_true(pluralize(w).endswith('ies')) - assert_equals(singularize(pluralize(w)), w) + assert pluralize(w).endswith("ies") + assert singularize(pluralize(w)) == w def test_all_singular_irregular(self): for singular_w in singular_irregular.values(): - assert_equals(singular_irregular[pluralize(singular_w)], singular_w) + assert singular_irregular[pluralize(singular_w)] == singular_w diff --git a/tests/test_np_extractor.py b/tests/test_np_extractor.py index ad4cdefc..b70675ee 100644 --- a/tests/test_np_extractor.py +++ b/tests/test_np_extractor.py @@ -1,9 +1,7 @@ -from __future__ import unicode_literals import unittest -from nose.tools import * # PEP8 asserts -from nose.plugins.attrib import attr import nltk +import pytest from textblob.base import BaseNPExtractor from textblob.np_extractors import ConllExtractor @@ -11,48 +9,51 @@ class TestConllExtractor(unittest.TestCase): - def setUp(self): self.extractor = ConllExtractor() - self.text = ''' + self.text = """ Python is a widely used general-purpose, high-level programming language. Its design philosophy emphasizes code readability, and its syntax allows programmers to express concepts in fewer lines of code than would be possible in other languages. The language provides constructs intended to enable clear programs on both a small and large scale. 
-''' - self.sentence = "Python is a widely used general-purpose, high-level programming language" +""" + self.sentence = ( + "Python is a widely used general-purpose, high-level programming language" + ) - @attr('slow') + @pytest.mark.slow def test_extract(self): noun_phrases = self.extractor.extract(self.text) - assert_true("Python" in noun_phrases) - assert_true("design philosophy" in noun_phrases) - assert_true("code readability" in noun_phrases) + assert "Python" in noun_phrases + assert "design philosophy" in noun_phrases + assert "code readability" in noun_phrases - @attr('slow') + @pytest.mark.slow def test_parse_sentence(self): parsed = self.extractor._parse_sentence(self.sentence) - assert_true(isinstance(parsed, nltk.tree.Tree)) + assert isinstance(parsed, nltk.tree.Tree) - @attr('slow') + @pytest.mark.slow def test_filter_insignificant(self): chunk = self.extractor._parse_sentence(self.sentence) tags = [tag for word, tag in chunk.leaves()] - assert_true('DT' in tags) + assert "DT" in tags filtered = filter_insignificant(chunk.leaves()) tags = [tag for word, tag in filtered] - assert_true("DT" not in tags) + assert "DT" not in tags class BadExtractor(BaseNPExtractor): - '''An extractor without an extract method. How useless.''' + """An extractor without an extract method. How useless.""" + pass def test_cannot_instantiate_incomplete_extractor(): - assert_raises(TypeError, - lambda: BadExtractor()) + with pytest.raises(TypeError): + BadExtractor() + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_parsers.py b/tests/test_parsers.py index f38fd125..54c84f99 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -1,20 +1,17 @@ -# -*- coding: utf-8 -*- import unittest -from nose.tools import * # PEP8 asserts -from textblob.parsers import PatternParser from textblob.en import parse as pattern_parse +from textblob.parsers import PatternParser class TestPatternParser(unittest.TestCase): - def setUp(self): self.parser = PatternParser() self.text = "And now for something completely different." def test_parse(self): - assert_equal(self.parser.parse(self.text), pattern_parse(self.text)) + assert self.parser.parse(self.text) == pattern_parse(self.text) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_sentiments.py b/tests/test_sentiments.py index feb55dc6..8286d482 100644 --- a/tests/test_sentiments.py +++ b/tests/test_sentiments.py @@ -1,68 +1,72 @@ -from __future__ import unicode_literals import unittest -from nose.tools import * # PEP8 asserts -from nose.plugins.attrib import attr -from textblob.sentiments import PatternAnalyzer, NaiveBayesAnalyzer, DISCRETE, CONTINUOUS +import pytest +from textblob.sentiments import ( + CONTINUOUS, + DISCRETE, + NaiveBayesAnalyzer, + PatternAnalyzer, +) -class TestPatternSentiment(unittest.TestCase): +class TestPatternSentiment(unittest.TestCase): def setUp(self): self.analyzer = PatternAnalyzer() def test_kind(self): - assert_equal(self.analyzer.kind, CONTINUOUS) + assert self.analyzer.kind == CONTINUOUS def test_analyze(self): p1 = "I feel great this morning." n1 = "This is a terrible car." 
p1_result = self.analyzer.analyze(p1) n1_result = self.analyzer.analyze(n1) - assert_true(p1_result[0] > 0) - assert_true(n1_result[0] < 0) - assert_equal(p1_result.polarity, p1_result[0]) - assert_equal(p1_result.subjectivity, p1_result[1]) + assert p1_result[0] > 0 + assert n1_result[0] < 0 + assert p1_result.polarity == p1_result[0] + assert p1_result.subjectivity == p1_result[1] def test_analyze_assessments(self): p1 = "I feel great this morning." n1 = "This is a terrible car." - p1_result = self.analyzer.analyze(p1,keep_assessments=True) - n1_result = self.analyzer.analyze(n1,keep_assessments=True) + p1_result = self.analyzer.analyze(p1, keep_assessments=True) + n1_result = self.analyzer.analyze(n1, keep_assessments=True) p1_assessment = p1_result.assessments[0] n1_assessment = n1_result.assessments[0] - assert_true(p1_assessment[1] > 0) - assert_true(n1_assessment[1] < 0) - assert_equal(p1_result.polarity, p1_assessment[1]) - assert_equal(p1_result.subjectivity, p1_assessment[2]) + assert p1_assessment[1] > 0 + assert n1_assessment[1] < 0 + assert p1_result.polarity == p1_assessment[1] + assert p1_result.subjectivity == p1_assessment[2] -class TestNaiveBayesAnalyzer(unittest.TestCase): +class TestNaiveBayesAnalyzer(unittest.TestCase): def setUp(self): self.analyzer = NaiveBayesAnalyzer() def test_kind(self): - assert_equal(self.analyzer.kind, DISCRETE) + assert self.analyzer.kind == DISCRETE - @attr('slow') + @pytest.mark.slow def test_analyze(self): - p1 = 'I feel great this morning.' - n1 = 'This is a terrible car.' + p1 = "I feel great this morning." + n1 = "This is a terrible car." p1_result = self.analyzer.analyze(p1) - assert_equal(p1_result[0], 'pos') - assert_equal(self.analyzer.analyze(n1)[0], 'neg') + assert p1_result[0] == "pos" + assert self.analyzer.analyze(n1)[0] == "neg" # The 2nd item should be the probability that it is positive - assert_true(isinstance(p1_result[1], float)) + assert isinstance(p1_result[1], float) # 3rd item is probability that it is negative - assert_true(isinstance(p1_result[2], float)) + assert isinstance(p1_result[2], float) assert_about_equal(p1_result[1] + p1_result[2], 1) - assert_equal(p1_result.classification, p1_result[0]) - assert_equal(p1_result.p_pos, p1_result[1]) - assert_equal(p1_result.p_neg, p1_result[2]) + assert p1_result.classification == p1_result[0] + assert p1_result.p_pos == p1_result[1] + assert p1_result.p_neg == p1_result[2] def assert_about_equal(first, second, places=4): - return assert_equal(round(first, places), second) + assert round(first, places) == second + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_taggers.py b/tests/test_taggers.py index 7dc52cb2..07895604 100644 --- a/tests/test_taggers.py +++ b/tests/test_taggers.py @@ -1,62 +1,80 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals import os import unittest -from nose.tools import * # PEP8 asserts -from nose.plugins.attrib import attr -from textblob.base import BaseTagger +import pytest + import textblob.taggers +from textblob.base import BaseTagger HERE = os.path.abspath(os.path.dirname(__file__)) -AP_MODEL_LOC = os.path.join(HERE, 'trontagger.pickle') +AP_MODEL_LOC = os.path.join(HERE, "trontagger.pickle") class TestPatternTagger(unittest.TestCase): - def setUp(self): - self.text = ("Simple is better than complex. " - "Complex is better than complicated.") + self.text = ( + "Simple is better than complex. " "Complex is better than complicated." 
+ ) self.tagger = textblob.taggers.PatternTagger() def test_init(self): tagger = textblob.taggers.PatternTagger() - assert_true(isinstance(tagger, textblob.taggers.BaseTagger)) + assert isinstance(tagger, textblob.taggers.BaseTagger) def test_tag(self): tags = self.tagger.tag(self.text) - assert_equal(tags, - [('Simple', 'JJ'), ('is', 'VBZ'), ('better', 'JJR'), - ('than', 'IN'), ('complex', 'JJ'), ('.', '.'), - ('Complex', 'NNP'), ('is', 'VBZ'), ('better', 'JJR'), - ('than', 'IN'), ('complicated', 'VBN'), ('.', '.')]) + assert tags == [ + ("Simple", "JJ"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complex", "JJ"), + (".", "."), + ("Complex", "NNP"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complicated", "VBN"), + (".", "."), + ] -@attr("slow") -@attr("no_pypy") -@attr("requires_numpy") +@pytest.mark.slow +@pytest.mark.numpy class TestNLTKTagger(unittest.TestCase): - def setUp(self): - self.text = ("Simple is better than complex. " - "Complex is better than complicated.") + self.text = ( + "Simple is better than complex. " "Complex is better than complicated." + ) self.tagger = textblob.taggers.NLTKTagger() def test_tag(self): tags = self.tagger.tag(self.text) - assert_equal(tags, - [('Simple', 'NN'), ('is', 'VBZ'), - ('better', 'JJR'), ('than', 'IN'), - ('complex', 'JJ'), ('.', '.'), ('Complex', 'NNP'), - ('is', 'VBZ'), ('better', 'JJR'), - ('than', 'IN'), ('complicated', 'VBN'), ('.', '.')]) + assert tags == [ + ("Simple", "NN"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complex", "JJ"), + (".", "."), + ("Complex", "NNP"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complicated", "VBN"), + (".", "."), + ] def test_cannot_instantiate_incomplete_tagger(): class BadTagger(BaseTagger): - '''A tagger without a tag method. How useless.''' + """A tagger without a tag method. How useless.""" + pass - assert_raises(TypeError, lambda: BadTagger()) -if __name__ == '__main__': + with pytest.raises(TypeError): + BadTagger() + + +if __name__ == "__main__": unittest.main() diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py index f99cc686..0e704948 100644 --- a/tests/test_tokenizers.py +++ b/tests/test_tokenizers.py @@ -1,21 +1,20 @@ -# -*- coding: utf-8 -*- import unittest -from nose.plugins.attrib import attr -from nose.tools import * # PEP8 asserts -from textblob.tokenizers import WordTokenizer, SentenceTokenizer, word_tokenize, sent_tokenize -from textblob.compat import PY2 +import pytest + +from textblob.tokenizers import ( + SentenceTokenizer, + WordTokenizer, + sent_tokenize, + word_tokenize, +) def is_generator(obj): - if PY2: - return hasattr(obj, 'next') - else: - return hasattr(obj, '__next__') + return hasattr(obj, "__next__") class TestWordTokenizer(unittest.TestCase): - def setUp(self): self.tokenizer = WordTokenizer() self.text = "Python is a high-level programming language." 
@@ -24,56 +23,71 @@ def tearDown(self): pass def test_tokenize(self): - assert_equal(self.tokenizer.tokenize(self.text), - ['Python', 'is', 'a', 'high-level', 'programming', - 'language', '.']) + assert self.tokenizer.tokenize(self.text) == [ + "Python", + "is", + "a", + "high-level", + "programming", + "language", + ".", + ] def test_exclude_punc(self): - assert_equal(self.tokenizer.tokenize(self.text, include_punc=False), - ['Python', 'is', 'a', 'high-level', 'programming', - 'language']) + assert self.tokenizer.tokenize(self.text, include_punc=False) == [ + "Python", + "is", + "a", + "high-level", + "programming", + "language", + ] def test_itokenize(self): gen = self.tokenizer.itokenize(self.text) - assert_equal(next(gen), "Python") - assert_equal(next(gen), "is") + assert next(gen) == "Python" + assert next(gen) == "is" def test_word_tokenize(self): tokens = word_tokenize(self.text) - assert_true(is_generator(tokens)) - assert_equal(list(tokens), self.tokenizer.tokenize(self.text)) + assert is_generator(tokens) + assert list(tokens) == self.tokenizer.tokenize(self.text) class TestSentenceTokenizer(unittest.TestCase): - def setUp(self): self.tokenizer = SentenceTokenizer() self.text = "Beautiful is better than ugly. Simple is better than complex." def test_tokenize(self): - assert_equal(self.tokenizer.tokenize(self.text), - ["Beautiful is better than ugly.", "Simple is better than complex."]) + assert self.tokenizer.tokenize(self.text) == [ + "Beautiful is better than ugly.", + "Simple is better than complex.", + ] - @attr("skip") # This is a known problem with the sentence tokenizer. + @pytest.mark.skip # This is a known problem with the sentence tokenizer. def test_tokenize_with_multiple_punctuation(self): text = "Hello world. How do you do?! My name's Steve..." - assert_equal(self.tokenizer.tokenize(text), - ["Hello world.", "How do you do?!", "My name's Steve..."]) - text2 = 'OMG! I am soooo LOL!!!' + assert self.tokenizer.tokenize(text) == [ + "Hello world.", + "How do you do?!", + "My name's Steve...", + ] + text2 = "OMG! I am soooo LOL!!!" tokens = self.tokenizer.tokenize(text2) - assert_equal(len(tokens), 2) - assert_equal(tokens, - ["OMG!", "I am soooo LOL!!!"]) + assert len(tokens) == 2 + assert tokens == ["OMG!", "I am soooo LOL!!!"] def test_itokenize(self): gen = self.tokenizer.itokenize(self.text) - assert_equal(next(gen), "Beautiful is better than ugly.") - assert_equal(next(gen), "Simple is better than complex.") + assert next(gen) == "Beautiful is better than ugly." + assert next(gen) == "Simple is better than complex." 
def test_sent_tokenize(self): tokens = sent_tokenize(self.text) - assert_true(is_generator(tokens)) # It's a generator - assert_equal(list(tokens), self.tokenizer.tokenize(self.text)) + assert is_generator(tokens) # It's a generator + assert list(tokens) == self.tokenizer.tokenize(self.text) + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_utils.py b/tests/test_utils.py index 01723ef4..32aff7dd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,34 +1,28 @@ -# -*- coding: utf-8 -*- - -from unittest import TestCase import os +from unittest import TestCase -from nose.tools import * # PEP8 asserts - -from textblob.utils import lowerstrip, strip_punc, is_filelike +from textblob.utils import is_filelike, lowerstrip, strip_punc HERE = os.path.abspath(os.path.dirname(__file__)) -CSV_FILE = os.path.join(HERE, 'data.csv') +CSV_FILE = os.path.join(HERE, "data.csv") + class UtilsTests(TestCase): def setUp(self): self.text = "this. Has. Punctuation?! " def test_strip_punc(self): - assert_equal(strip_punc(self.text), - 'this. Has. Punctuation') + assert strip_punc(self.text) == "this. Has. Punctuation" def test_strip_punc_all(self): - assert_equal(strip_punc(self.text, all=True), - 'this Has Punctuation') + assert strip_punc(self.text, all=True) == "this Has Punctuation" def test_lowerstrip(self): - assert_equal(lowerstrip(self.text), - 'this. has. punctuation') + assert lowerstrip(self.text) == "this. has. punctuation" def test_is_filelike(): with open(CSV_FILE) as fp: - assert_true(is_filelike(fp)) - assert_false(is_filelike('notafile')) - assert_false(is_filelike(12.3)) + assert is_filelike(fp) + assert not is_filelike("notafile") + assert not is_filelike(12.3) diff --git a/textblob/__init__.py b/textblob/__init__.py deleted file mode 100644 index 4af2b949..00000000 --- a/textblob/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -import os -from .blob import TextBlob, Word, Sentence, Blobber, WordList - -__version__ = '0.17.1' -__license__ = 'MIT' -__author__ = 'Steven Loria' - -PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__)) - -__all__ = [ - 'TextBlob', - 'Word', - 'Sentence', - 'Blobber', - 'WordList', -] diff --git a/textblob/compat.py b/textblob/compat.py deleted file mode 100644 index bf384dd1..00000000 --- a/textblob/compat.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- -import sys - -PY2 = int(sys.version[0]) == 2 - -if PY2: - from itertools import imap, izip - import urllib2 as request - from urllib import quote as urlquote - from urllib import urlencode - text_type = unicode - binary_type = str - string_types = (str, unicode) - unicode = unicode - basestring = basestring - imap = imap - izip = izip - import unicodecsv as csv - - def implements_to_string(cls): - """Class decorator that renames __str__ to __unicode__ and - modifies __str__ that returns utf-8. 
- """ - cls.__unicode__ = cls.__str__ - cls.__str__ = lambda x: x.__unicode__().encode('utf-8') - return cls -else: # PY3 - from urllib import request - from urllib.parse import quote as urlquote - from urllib.parse import urlencode - text_type = str - binary_type = bytes - string_types = (str,) - unicode = str - basestring = (str, bytes) - imap = map - izip = zip - import csv - - implements_to_string = lambda x: x - - -# From six -def with_metaclass(meta, *bases): - """Create a base class with a metaclass.""" - # This requires a bit of explanation: the basic idea is to make a dummy - # metaclass for one level of class instantiation that replaces itself with - # the actual metaclass. - class metaclass(meta): # noqa - - def __new__(cls, name, this_bases, d): - return meta(name, bases, d) - return type.__new__(metaclass, 'temporary_class', (), {}) diff --git a/textblob/en/__init__.py b/textblob/en/__init__.py deleted file mode 100644 index 5479eb14..00000000 --- a/textblob/en/__init__.py +++ /dev/null @@ -1,139 +0,0 @@ -# -*- coding: utf-8 -*- -'''This file is based on pattern.en. See the bundled NOTICE file for -license information. -''' -from __future__ import absolute_import -import os - -from textblob._text import (Parser as _Parser, Sentiment as _Sentiment, Lexicon, - WORD, POS, CHUNK, PNP, PENN, UNIVERSAL, Spelling) - -from textblob.compat import text_type, unicode - -try: - MODULE = os.path.dirname(os.path.abspath(__file__)) -except: - MODULE = "" - -spelling = Spelling( - path = os.path.join(MODULE, "en-spelling.txt") -) - -#--- ENGLISH PARSER -------------------------------------------------------------------------------- - -def find_lemmata(tokens): - """ Annotates the tokens with lemmata for plural nouns and conjugated verbs, - where each token is a [word, part-of-speech] list. 
- """ - for token in tokens: - word, pos, lemma = token[0], token[1], token[0] - # cats => cat - if pos == "NNS": - lemma = singularize(word) - # sat => sit - if pos.startswith(("VB", "MD")): - lemma = conjugate(word, INFINITIVE) or word - token.append(lemma.lower()) - return tokens - -class Parser(_Parser): - - def find_lemmata(self, tokens, **kwargs): - return find_lemmata(tokens) - - def find_tags(self, tokens, **kwargs): - if kwargs.get("tagset") in (PENN, None): - kwargs.setdefault("map", lambda token, tag: (token, tag)) - if kwargs.get("tagset") == UNIVERSAL: - kwargs.setdefault("map", lambda token, tag: penntreebank2universal(token, tag)) - return _Parser.find_tags(self, tokens, **kwargs) - -class Sentiment(_Sentiment): - - def load(self, path=None): - _Sentiment.load(self, path) - # Map "terrible" to adverb "terribly" (+1% accuracy) - if not path: - for w, pos in list(dict.items(self)): - if "JJ" in pos: - if w.endswith("y"): - w = w[:-1] + "i" - if w.endswith("le"): - w = w[:-2] - p, s, i = pos["JJ"] - self.annotate(w + "ly", "RB", p, s, i) - - -lexicon = Lexicon( - path = os.path.join(MODULE, "en-lexicon.txt"), - morphology = os.path.join(MODULE, "en-morphology.txt"), - context = os.path.join(MODULE, "en-context.txt"), - entities = os.path.join(MODULE, "en-entities.txt"), - language = "en" -) -parser = Parser( - lexicon = lexicon, - default = ("NN", "NNP", "CD"), - language = "en" -) - -sentiment = Sentiment( - path = os.path.join(MODULE, "en-sentiment.xml"), - synset = "wordnet_id", - negations = ("no", "not", "n't", "never"), - modifiers = ("RB",), - modifier = lambda w: w.endswith("ly"), - tokenizer = parser.find_tokens, - language = "en" -) - - -def tokenize(s, *args, **kwargs): - """ Returns a list of sentences, where punctuation marks have been split from words. - """ - return parser.find_tokens(text_type(s), *args, **kwargs) - -def parse(s, *args, **kwargs): - """ Returns a tagged Unicode string. - """ - return parser.parse(unicode(s), *args, **kwargs) - -def parsetree(s, *args, **kwargs): - """ Returns a parsed Text from the given string. - """ - return Text(parse(unicode(s), *args, **kwargs)) - -def split(s, token=[WORD, POS, CHUNK, PNP]): - """ Returns a parsed Text from the given parsed string. - """ - return Text(text_type(s), token) - -def tag(s, tokenize=True, encoding="utf-8"): - """ Returns a list of (token, tag)-tuples from the given string. - """ - tags = [] - for sentence in parse(s, tokenize, True, False, False, False, encoding).split(): - for token in sentence: - tags.append((token[0], token[1])) - return tags - -def suggest(w): - """ Returns a list of (word, confidence)-tuples of spelling corrections. - """ - return spelling.suggest(w) - -def polarity(s, **kwargs): - """ Returns the sentence polarity (positive/negative) between -1.0 and 1.0. - """ - return sentiment(unicode(s), **kwargs)[0] - -def subjectivity(s, **kwargs): - """ Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0. - """ - return sentiment(unicode(s), **kwargs)[1] - -def positive(s, threshold=0.1, **kwargs): - """ Returns True if the given sentence has a positive sentiment (polarity >= threshold). - """ - return polarity(unicode(s), **kwargs) >= threshold - diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py deleted file mode 100644 index f66c7e2c..00000000 --- a/textblob/en/inflect.py +++ /dev/null @@ -1,472 +0,0 @@ -# -*- coding: utf-8 -*- -'''The pluralize and singular methods from the pattern library. - -Licenced under the BSD. 
-See here https://github.com/clips/pattern/blob/master/LICENSE.txt for -complete license information. -''' -import re - -VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB" - -#### PLURALIZE ##################################################################################### -# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway: -# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html - -# Prepositions are used to solve things like -# "mother-in-law" or "man at arms" -plural_prepositions = [ - "about", "above", "across", "after", "among", "around", "at", "athwart", "before", "behind", - "below", "beneath", "beside", "besides", "between", "betwixt", "beyond", "but", "by", "during", - "except", "for", "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over", - "since", "till", "to", "under", "until", "unto", "upon", "with" -] - -# Inflection rules that are either general, -# or apply to a certain category of words, -# or apply to a certain category of words only in classical mode, -# or apply only in classical mode. -# Each rule consists of: -# suffix, inflection, category and classic flag. -plural_rules = [ - # 0) Indefinite articles and demonstratives. - [["^a$|^an$", "some", None, False], - ["^this$", "these", None, False], - ["^that$", "those", None, False], - ["^any$", "all", None, False] - ], - # 1) Possessive adjectives. - # Overlaps with 1/ for "his" and "its". - # Overlaps with 2/ for "her". - [["^my$", "our", None, False], - ["^your$|^thy$", "your", None, False], - ["^her$|^his$|^its$|^their$", "their", None, False] - ], - # 2) Possessive pronouns. - [["^mine$", "ours", None, False], - ["^yours$|^thine$", "yours", None, False], - ["^hers$|^his$|^its$|^theirs$", "theirs", None, False] - ], - # 3) Personal pronouns. - [["^I$", "we", None, False], - ["^me$", "us", None, False], - ["^myself$", "ourselves", None, False], - ["^you$", "you", None, False], - ["^thou$|^thee$", "ye", None, False], - ["^yourself$|^thyself$", "yourself", None, False], - ["^she$|^he$|^it$|^they$", "they", None, False], - ["^her$|^him$|^it$|^them$", "them", None, False], - ["^herself$|^himself$|^itself$|^themself$", "themselves", None, False], - ["^oneself$", "oneselves", None, False] - ], - # 4) Words that do not inflect. - [["$", "", "uninflected", False], - ["$", "", "uncountable", False], - ["fish$", "fish", None, False], - ["([- ])bass$", "\\1bass", None, False], - ["ois$", "ois", None, False], - ["sheep$", "sheep", None, False], - ["deer$", "deer", None, False], - ["pox$", "pox", None, False], - ["([A-Z].*)ese$", "\\1ese", None, False], - ["itis$", "itis", None, False], - ["(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose", None, False] - ], - # 5) Irregular plurals (mongoose, oxen). 
- [["atlas$", "atlantes", None, True], - ["atlas$", "atlases", None, False], - ["beef$", "beeves", None, True], - ["brother$", "brethren", None, True], - ["child$", "children", None, False], - ["corpus$", "corpora", None, True], - ["corpus$", "corpuses", None, False], - ["^cow$", "kine", None, True], - ["ephemeris$", "ephemerides", None, False], - ["ganglion$", "ganglia", None, True], - ["genie$", "genii", None, True], - ["genus$", "genera", None, False], - ["graffito$", "graffiti", None, False], - ["loaf$", "loaves", None, False], - ["money$", "monies", None, True], - ["mongoose$", "mongooses", None, False], - ["mythos$", "mythoi", None, False], - ["octopus$", "octopodes", None, True], - ["opus$", "opera", None, True], - ["opus$", "opuses", None, False], - ["^ox$", "oxen", None, False], - ["penis$", "penes", None, True], - ["penis$", "penises", None, False], - ["soliloquy$", "soliloquies", None, False], - ["testis$", "testes", None, False], - ["trilby$", "trilbys", None, False], - ["turf$", "turves", None, True], - ["numen$", "numena", None, False], - ["occiput$", "occipita", None, True] - ], - # 6) Irregular inflections for common suffixes (synopses, mice, men). - [["man$", "men", None, False], - ["person$", "people", None, False], - ["([lm])ouse$", "\\1ice", None, False], - ["tooth$", "teeth", None, False], - ["goose$", "geese", None, False], - ["foot$", "feet", None, False], - ["zoon$", "zoa", None, False], - ["([csx])is$", "\\1es", None, False] - ], - # 7) Fully assimilated classical inflections (vertebrae, codices). - [["ex$", "ices", "ex-ices", False], - ["ex$", "ices", "ex-ices-classical", True], - ["um$", "a", "um-a", False], - ["um$", "a", "um-a-classical", True], - ["on$", "a", "on-a", False], - ["a$", "ae", "a-ae", False], - ["a$", "ae", "a-ae-classical", True] - ], - # 8) Classical variants of modern inflections (stigmata, soprani). - [["trix$", "trices", None, True], - ["eau$", "eaux", None, True], - ["ieu$", "ieu", None, True], - ["([iay])nx$", "\\1nges", None, True], - ["en$", "ina", "en-ina-classical", True], - ["a$", "ata", "a-ata-classical", True], - ["is$", "ides", "is-ides-classical", True], - ["us$", "i", "us-i-classical", True], - ["us$", "us", "us-us-classical", True], - ["o$", "i", "o-i-classical", True], - ["$", "i", "-i-classical", True], - ["$", "im", "-im-classical", True] - ], - # 9) -ch, -sh and -ss and the s-singular group take -es in the plural (churches, classes, lenses). - [["([cs])h$", "\\1hes", None, False], - ["ss$", "sses", None, False], - ["x$", "xes", None, False], - ["s$", "ses", "s-singular", False] - ], - # 10) Certain words ending in -f or -fe take -ves in the plural (lives, wolves). - [["([aeo]l)f$", "\\1ves", None, False], - ["([^d]ea)f$", "\\1ves", None, False], - ["arf$", "arves", None, False], - ["([nlw]i)fe$", "\\1ves", None, False], - ], - # 11) -y takes -ys if preceded by a vowel or when a proper noun, - # but -ies if preceded by a consonant (storeys, Marys, stories). - [["([aeiou])y$", "\\1ys", None, False], - ["([A-Z].*)y$", "\\1ys", None, False], - ["y$", "ies", None, False] - ], - # 12) Some words ending in -o take -os, the rest take -oes. - # Words in which the -o is preceded by a vowel always take -os (lassos, potatoes, bamboos). - [["o$", "os", "o-os", False], - ["([aeiou])o$", "\\1os", None, False], - ["o$", "oes", None, False] - ], - # 13) Miltary stuff (Major Generals). - [["l$", "ls", "general-generals", False] - ], - # 14) Otherwise, assume that the plural just adds -s (cats, programmes). 
- [["$", "s", None, False] - ], -] - -# For performance, compile the regular expressions only once: -for ruleset in plural_rules: - for rule in ruleset: - rule[0] = re.compile(rule[0]) - -# Suffix categories. -plural_categories = { - "uninflected": [ - "aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis", - "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "elk", - "flounder", "gallows", "graffiti", "headquarters", "herpes", "high-jinks", "homework", "innings", - "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "offspring", "news", "pincers", - "pliers", "proceedings", "rabies", "salmon", "scissors", "series", "shears", "species", "swine", - "trout", "tuna", "whiting", "wildebeest"], - "uncountable": [ - "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture", - "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage", - "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", - "sand", "software", "understanding", "water"], - "s-singular": [ - "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "bus", "caddis", "canvas", - "chaos", "christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", - "ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros", - "sassafras", "trellis"], - "ex-ices": ["codex", "murex", "silex"], - "ex-ices-classical": [ - "apex", "cortex", "index", "latex", "pontifex", "simplex", "vertex", "vortex"], - "um-a": [ - "agendum", "bacterium", "candelabrum", "datum", "desideratum", "erratum", "extremum", - "ovum", "stratum"], - "um-a-classical": [ - "aquarium", "compendium", "consortium", "cranium", "curriculum", "dictum", "emporium", - "enconium", "gymnasium", "honorarium", "interregnum", "lustrum", "maximum", "medium", - "memorandum", "millenium", "minimum", "momentum", "optimum", "phylum", "quantum", "rostrum", - "spectrum", "speculum", "stadium", "trapezium", "ultimatum", "vacuum", "velum"], - "on-a": [ - "aphelion", "asyndeton", "criterion", "hyperbaton", "noumenon", "organon", "perihelion", - "phenomenon", "prolegomenon"], - "a-ae": ["alga", "alumna", "vertebra"], - "a-ae-classical": [ - "abscissa", "amoeba", "antenna", "aurora", "formula", "hydra", "hyperbola", "lacuna", - "medusa", "nebula", "nova", "parabola"], - "en-ina-classical": ["foramen", "lumen", "stamen"], - "a-ata-classical": [ - "anathema", "bema", "carcinoma", "charisma", "diploma", "dogma", "drama", "edema", "enema", - "enigma", "gumma", "lemma", "lymphoma", "magma", "melisma", "miasma", "oedema", "sarcoma", - "schema", "soma", "stigma", "stoma", "trauma"], - "is-ides-classical": ["clitoris", "iris"], - "us-i-classical": [ - "focus", "fungus", "genius", "incubus", "nimbus", "nucleolus", "radius", "stylus", "succubus", - "torus", "umbilicus", "uterus"], - "us-us-classical": [ - "apparatus", "cantus", "coitus", "hiatus", "impetus", "nexus", "plexus", "prospectus", - "sinus", "status"], - "o-i-classical": ["alto", "basso", "canto", "contralto", "crescendo", "solo", "soprano", "tempo"], - "-i-classical": ["afreet", "afrit", "efreet"], - "-im-classical": ["cherub", "goy", "seraph"], - "o-os": [ - "albino", "archipelago", "armadillo", "commando", "ditto", "dynamo", "embryo", "fiasco", - "generalissimo", "ghetto", "guano", "inferno", "jumbo", "lingo", "lumbago", "magneto", - "manifesto", "medico", "octavo", "photo", "pro", "quarto", "rhino", 
"stylo"], - "general-generals": [ - "Adjutant", "Brigadier", "Lieutenant", "Major", "Quartermaster", - "adjutant", "brigadier", "lieutenant", "major", "quartermaster"], -} - -def pluralize(word, pos=NOUN, custom={}, classical=True): - """ Returns the plural of a given word. - For example: child -> children. - Handles nouns and adjectives, using classical inflection by default - (e.g. where "matrix" pluralizes to "matrices" instead of "matrixes"). - The custom dictionary is for user-defined replacements. - """ - - if word in custom: - return custom[word] - - # Recursion of genitives. - # Remove the apostrophe and any trailing -s, - # form the plural of the resultant noun, and then append an apostrophe (dog's -> dogs'). - if word.endswith("'") or word.endswith("'s"): - owner = word.rstrip("'s") - owners = pluralize(owner, pos, custom, classical) - if owners.endswith("s"): - return owners + "'" - else: - return owners + "'s" - - # Recursion of compound words - # (Postmasters General, mothers-in-law, Roman deities). - words = word.replace("-", " ").split(" ") - if len(words) > 1: - if words[1] == "general" or words[1] == "General" and \ - words[0] not in plural_categories["general-generals"]: - return word.replace(words[0], pluralize(words[0], pos, custom, classical)) - elif words[1] in plural_prepositions: - return word.replace(words[0], pluralize(words[0], pos, custom, classical)) - else: - return word.replace(words[-1], pluralize(words[-1], pos, custom, classical)) - - # Only a very few number of adjectives inflect. - n = list(range(len(plural_rules))) - if pos.startswith(ADJECTIVE): - n = [0, 1] - - # Apply pluralization rules. - for i in n: - ruleset = plural_rules[i] - for rule in ruleset: - suffix, inflection, category, classic = rule - # A general rule, or a classic rule in classical mode. - if category == None: - if not classic or (classic and classical): - if suffix.search(word) is not None: - return suffix.sub(inflection, word) - # A rule relating to a specific category of words. - if category != None: - if word in plural_categories[category] and (not classic or (classic and classical)): - if suffix.search(word) is not None: - return suffix.sub(inflection, word) - -#### SINGULARIZE ################################################################################### -# Adapted from Bermi Ferrer's Inflector for Python: -# http://www.bermi.org/inflector/ - -# Copyright (c) 2006 Bermi Ferrer Martinez -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software to deal in this software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of this software, and to permit -# persons to whom this software is furnished to do so, subject to the following -# condition: -# -# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THIS SOFTWARE. 
- -singular_rules = [ - ['(?i)(.)ae$', '\\1a'], - ['(?i)(.)itis$', '\\1itis'], - ['(?i)(.)eaux$', '\\1eau'], - ['(?i)(quiz)zes$', '\\1'], - ['(?i)(matr)ices$', '\\1ix'], - ['(?i)(ap|vert|ind)ices$', '\\1ex'], - ['(?i)^(ox)en', '\\1'], - ['(?i)(alias|status)es$', '\\1'], - ['(?i)([octop|vir])i$', '\\1us'], - ['(?i)(cris|ax|test)es$', '\\1is'], - ['(?i)(shoe)s$', '\\1'], - ['(?i)(o)es$', '\\1'], - ['(?i)(bus)es$', '\\1'], - ['(?i)([m|l])ice$', '\\1ouse'], - ['(?i)(x|ch|ss|sh)es$', '\\1'], - ['(?i)(m)ovies$', '\\1ovie'], - ['(?i)(.)ombies$', '\\1ombie'], - ['(?i)(s)eries$', '\\1eries'], - ['(?i)([^aeiouy]|qu)ies$', '\\1y'], - # Certain words ending in -f or -fe take -ves in the plural (lives, wolves). - ["([aeo]l)ves$", "\\1f"], - ["([^d]ea)ves$", "\\1f"], - ["arves$", "arf"], - ["erves$", "erve"], - ["([nlw]i)ves$", "\\1fe"], - ['(?i)([lr])ves$', '\\1f'], - ["([aeo])ves$", "\\1ve"], - ['(?i)(sive)s$', '\\1'], - ['(?i)(tive)s$', '\\1'], - ['(?i)(hive)s$', '\\1'], - ['(?i)([^f])ves$', '\\1fe'], - # -es suffix. - ['(?i)(^analy)ses$', '\\1sis'], - ['(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$', '\\1\\2sis'], - ['(?i)(.)opses$', '\\1opsis'], - ['(?i)(.)yses$', '\\1ysis'], - ['(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'], - ['(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$', '\\1ose'], - ['(?i)(.)oses$', '\\1osis'], - # -a - ['(?i)([ti])a$', '\\1um'], - ['(?i)(n)ews$', '\\1ews'], - ['(?i)s$', ''], -] - -# For performance, compile the regular expressions only once: -for rule in singular_rules: - rule[0] = re.compile(rule[0]) - -singular_uninflected = [ - "aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis", - "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", - "elk", "flounder", "gallows", "georgia", "graffiti", "headquarters", "herpes", "high-jinks", - "homework", "innings", "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "news", - "offspring", "pincers", "pliers", "proceedings", "rabies", "salmon", "scissors", "series", - "shears", "species", "swine", "swiss", "trout", "tuna", "whiting", "wildebeest" -] -singular_uncountable = [ - "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture", - "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage", - "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", "sand", - "software", "understanding", "water" -] -singular_ie = [ - "algerie", "auntie", "beanie", "birdie", "bogie", "bombie", "bookie", "collie", "cookie", "cutie", - "doggie", "eyrie", "freebie", "goonie", "groupie", "hankie", "hippie", "hoagie", "hottie", - "indie", "junkie", "laddie", "laramie", "lingerie", "meanie", "nightie", "oldie", "^pie", - "pixie", "quickie", "reverie", "rookie", "softie", "sortie", "stoolie", "sweetie", "techie", - "^tie", "toughie", "valkyrie", "veggie", "weenie", "yuppie", "zombie" -] -singular_s = plural_categories['s-singular'] - -# key plural, value singular -singular_irregular = { - "men": "man", - "people": "person", - "children": "child", - "sexes": "sex", - "axes": "axe", - "moves": "move", - "teeth": "tooth", - "geese": "goose", - "feet": "foot", - "zoa": "zoon", - "atlantes": "atlas", - "atlases": "atlas", - "beeves": "beef", - "brethren": "brother", - "children": "child", - "corpora": "corpus", - "corpuses": "corpus", - "kine": "cow", - "ephemerides": "ephemeris", - "ganglia": "ganglion", - "genii": "genie", - "genera": "genus", - 
"graffiti": "graffito", - "helves": "helve", - "leaves": "leaf", - "loaves": "loaf", - "monies": "money", - "mongooses": "mongoose", - "mythoi": "mythos", - "octopodes": "octopus", - "opera": "opus", - "opuses": "opus", - "oxen": "ox", - "penes": "penis", - "penises": "penis", - "soliloquies": "soliloquy", - "testes": "testis", - "trilbys": "trilby", - "turves": "turf", - "numena": "numen", - "occipita": "occiput", - "our": "my", -} - -def singularize(word, pos=NOUN, custom={}): - - if word in list(custom.keys()): - return custom[word] - - # Recursion of compound words (e.g. mothers-in-law). - if "-" in word: - words = word.split("-") - if len(words) > 1 and words[1] in plural_prepositions: - return singularize(words[0], pos, custom)+"-"+"-".join(words[1:]) - # dogs' => dog's - if word.endswith("'"): - return singularize(word[:-1]) + "'s" - - lower = word.lower() - for w in singular_uninflected: - if w.endswith(lower): - return word - for w in singular_uncountable: - if w.endswith(lower): - return word - for w in singular_ie: - if lower.endswith(w+"s"): - return w - for w in singular_s: - if lower.endswith(w + 'es'): - return w - for w in list(singular_irregular.keys()): - if lower.endswith(w): - return re.sub('(?i)'+w+'$', singular_irregular[w], word) - - for rule in singular_rules: - suffix, inflection = rule - match = suffix.search(word) - if match: - groups = match.groups() - for k in range(0, len(groups)): - if groups[k] == None: - inflection = inflection.replace('\\'+str(k+1), '') - return suffix.sub(inflection, word) - - return word diff --git a/textblob/inflect.py b/textblob/inflect.py deleted file mode 100644 index fb5f1955..00000000 --- a/textblob/inflect.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -'''Make word inflection default to English. This allows for backwards -compatibility so you can still import text.inflect. 
- - >>> from textblob.inflect import singularize - -is equivalent to - - >>> from textblob.en.inflect import singularize -''' -from __future__ import absolute_import -from textblob.en.inflect import singularize, pluralize - -__all__ = [ - 'singularize', - 'pluralize', -] diff --git a/textblob/sentiments.py b/textblob/sentiments.py deleted file mode 100644 index 9c7a28bd..00000000 --- a/textblob/sentiments.py +++ /dev/null @@ -1,22 +0,0 @@ -# -*- coding: utf-8 -*- -'''Default sentiment analyzers are English for backwards compatibility, so -you can still do - ->>> from textblob.sentiments import PatternAnalyzer - -which is equivalent to - ->>> from textblob.en.sentiments import PatternAnalyzer -''' -from __future__ import absolute_import -from textblob.base import BaseSentimentAnalyzer -from textblob.en.sentiments import (DISCRETE, CONTINUOUS, - PatternAnalyzer, NaiveBayesAnalyzer) - -__all__ = [ - 'BaseSentimentAnalyzer', - 'DISCRETE', - 'CONTINUOUS', - 'PatternAnalyzer', - 'NaiveBayesAnalyzer', -] diff --git a/textblob/taggers.py b/textblob/taggers.py deleted file mode 100644 index 521adfc2..00000000 --- a/textblob/taggers.py +++ /dev/null @@ -1,18 +0,0 @@ -'''Default taggers to the English taggers for backwards incompatibility, so you -can still do - ->>> from textblob.taggers import NLTKTagger - -which is equivalent to - ->>> from textblob.en.taggers import NLTKTagger -''' -from __future__ import absolute_import -from textblob.base import BaseTagger -from textblob.en.taggers import PatternTagger, NLTKTagger - -__all__ = [ - 'BaseTagger', - 'PatternTagger', - 'NLTKTagger', -] diff --git a/textblob/translate.py b/textblob/translate.py deleted file mode 100644 index f01ce963..00000000 --- a/textblob/translate.py +++ /dev/null @@ -1,149 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Translator module that uses the Google Translate API. - -Adapted from Terry Yin's google-translate-python. -Language detection added by Steven Loria. -""" -from __future__ import absolute_import - -import codecs -import json -import re - -from textblob.compat import PY2, request, urlencode -from textblob.exceptions import TranslatorError, NotTranslated - - -class Translator(object): - - """A language translator and detector. 
- - Usage: - :: - >>> from textblob.translate import Translator - >>> t = Translator() - >>> t.translate('hello', from_lang='en', to_lang='fr') - u'bonjour' - >>> t.detect("hola") - u'es' - """ - - url = "http://translate.google.com/translate_a/t?client=webapp&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&otf=2&ssel=0&tsel=0&kc=1" - - headers = { - 'Accept': '*/*', - 'Connection': 'keep-alive', - 'User-Agent': ( - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) ' - 'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19') - } - - def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=None): - """Translate the source text from one language to another.""" - if PY2: - source = source.encode('utf-8') - data = {"q": source} - url = u'{url}&sl={from_lang}&tl={to_lang}&hl={to_lang}&tk={tk}&client={client}'.format( - url=self.url, - from_lang=from_lang, - to_lang=to_lang, - tk=_calculate_tk(source), - client="te", - ) - response = self._request(url, host=host, type_=type_, data=data) - result = json.loads(response) - if isinstance(result, list): - try: - result = result[0] # ignore detected language - except IndexError: - pass - self._validate_translation(source, result) - return result - - def detect(self, source, host=None, type_=None): - """Detect the source text's language.""" - if PY2: - source = source.encode('utf-8') - if len(source) < 3: - raise TranslatorError('Must provide a string with at least 3 characters.') - data = {"q": source} - url = u'{url}&sl=auto&tk={tk}&client={client}'.format( - url=self.url, - tk=_calculate_tk(source), - client="te", - ) - response = self._request(url, host=host, type_=type_, data=data) - result, language = json.loads(response) - return language - - def _validate_translation(self, source, result): - """Validate API returned expected schema, and that the translated text - is different than the original string. - """ - if not result: - raise NotTranslated('Translation API returned and empty response.') - if PY2: - result = result.encode('utf-8') - if result.strip() == source.strip(): - raise NotTranslated('Translation API returned the input string unchanged.') - - def _request(self, url, host=None, type_=None, data=None): - encoded_data = urlencode(data).encode('utf-8') - req = request.Request(url=url, headers=self.headers, data=encoded_data) - if host or type_: - req.set_proxy(host=host, type=type_) - resp = request.urlopen(req) - content = resp.read() - return content.decode('utf-8') - - -def _unescape(text): - """Unescape unicode character codes within a string. - """ - pattern = r'\\{1,2}u[0-9a-fA-F]{4}' - return re.sub(pattern, lambda x: codecs.getdecoder('unicode_escape')(x.group())[0], text) - - -def _calculate_tk(source): - """Reverse engineered cross-site request protection.""" - # Source: https://github.com/soimort/translate-shell/issues/94#issuecomment-165433715 - # Source: http://www.liuxiatool.com/t.php - - def c_int(x, nbits=32): - """ C cast to int32, int16, int8... """ - return (x & ((1 << (nbits - 1)) - 1)) - (x & (1 << (nbits - 1))) - - def c_uint(x, nbits=32): - """ C cast to uint32, uint16, uint8... 
""" - return x & ((1 << nbits) - 1) - - tkk = [406398, 561666268 + 1526272306] - b = tkk[0] - - if PY2: - d = map(ord, source) - else: - d = source.encode('utf-8') - - def RL(a, b): - for c in range(0, len(b) - 2, 3): - d = b[c + 2] - d = ord(d) - 87 if d >= 'a' else int(d) - xa = c_uint(a) - d = xa >> d if b[c + 1] == '+' else xa << d - a = a + d & 4294967295 if b[c] == '+' else a ^ d - return c_int(a) - - a = b - - for di in d: - a = RL(a + di, "+-a^+6") - - a = RL(a, "+-3^+b+-f") - a ^= tkk[1] - a = a if a >= 0 else ((a & 2147483647) + 2147483648) - a %= pow(10, 6) - - tk = '{0:d}.{1:d}'.format(a, a ^ b) - return tk diff --git a/tox.ini b/tox.ini index 374d8078..ea0be73c 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,34 @@ [tox] -envlist =py27,py35,py36,py37,py38 +envlist = + lint + py{38,39,310,311,312} + py38-lowest + [testenv] -deps = -rdev-requirements.txt -commands= - python run_tests.py +extras = tests +deps = + lowest: nltk==3.8 +commands = pytest {posargs} + + +[testenv:lint] +deps = pre-commit~=3.5 +skip_install = true +commands = + pre-commit run --all-files + +[testenv:docs] +extras = docs +commands = sphinx-build docs/ docs/_build {posargs} + +; Below tasks are for development only (not run in CI) + +[testenv:watch-docs] +deps = sphinx-autobuild +extras = docs +commands = sphinx-autobuild --open-browser docs/ docs/_build {posargs} --watch src/textblob --delay 2 + +[testenv:watch-readme] +deps = restview +skip_install = true +commands = restview README.rst