From c1dc95854b740e099e3b28cf2eb5ca6d59df4d01 Mon Sep 17 00:00:00 2001 From: Dan Eads Date: Mon, 15 Jul 2019 21:33:41 -0400 Subject: [PATCH 1/2] 1.2.0 - Selenium support --- .gitignore | 3 ++ README.md | 68 +++++++++++++++++++++++++++++++++++++++----- dev-requirements.txt | 1 + pypatent/__init__.py | 58 +++++++++++++++++++++++++++++-------- requirements.txt | 3 +- setup.py | 4 +-- 6 files changed, 115 insertions(+), 22 deletions(-) create mode 100644 dev-requirements.txt diff --git a/.gitignore b/.gitignore index f91ff2b..dd8358c 100644 --- a/.gitignore +++ b/.gitignore @@ -214,3 +214,6 @@ pip-log.txt #Mr Developer .mr.developer.cfg + +IntelliJ +.idea \ No newline at end of file diff --git a/README.md b/README.md index 82e726a..9a7b6b3 100644 --- a/README.md +++ b/README.md @@ -4,16 +4,10 @@ pypatent is a tiny Python package to easily search for and scrape US Patent and [PyPI page](https://pypi.python.org/pypi/pypatent) -*New in version 1.1:* - -This version makes searching and storing patent data easier: -* Simplified to 2 objects: `Search` and `Patent` -* A `Search` object searches the USPTO site and can output the results as a DataFrame or list. It can scrape the details of each patent, or just get the patent title and URL. Most users will only need to use this object. -* A `Patent` object fetches and holds a single patent's info. Fetching the patent's details is now optional. This object should only be used when you already have the patent URL and aren't conducting a search. ## Requirements -Python 3, BeautifulSoup, requests, pandas, re +Python 3, BeautifulSoup, requests, pandas, re, selenium ## Installation @@ -21,6 +15,10 @@ Python 3, BeautifulSoup, requests, pandas, re pip install pypatent ``` +If using Selenium for scraping (introduced in version 1.2), be sure to install a Selenium WebDriver. +For Chrome, use `chromedriver`. For Firefox, use `geckodriver`. +See [the Selenium download page](https://www.seleniumhq.org/download/) for more details and options. 
+ ## Searching for patents The Search object works similarly to the [Advanced Search at the USPTO](http://patft.uspto.gov/netahtml/PTO/search-adv.htm), with additional options. @@ -191,3 +189,59 @@ this_patent.fetch_details() * ILRD: International Registration Date * ILPD: International Registration Publication Date * ILFD: Hague International Filing Date + +### Changelog +#### New in version 1.2 + +This version implements Selenium support for scraping. +Previous versions used the `requests` library for all requests; however, the USPTO site has been causing problems for it. +Some users have been able to use `requests` without issue, while others get 4xx errors. + +PyPatent version 1.2 adds a new `WebConnection` object that lets the user use Selenium WebDrivers in place of the `requests` library. +This `WebConnection` object is optional. +If used, it should be passed as the `web_connection` argument when initializing `Search` or `Patent` objects. +Use it in the following cases: +* When you want to use Selenium instead of `requests` +* When you want to use `requests` but with a custom user-agent or headers + +An example using the Firefox WebDriver: + +```python +import pypatent +from selenium import webdriver + +driver = webdriver.Firefox() # Requires geckodriver in your PATH + +conn = pypatent.WebConnection(use_selenium=True, selenium_driver=driver) + +res = pypatent.Search('microsoft', get_patent_details=True, web_connection=conn) + +print(res) +``` + +An example using the `requests` library with a custom user agent: +```python +import pypatent + +conn = pypatent.WebConnection(use_selenium=False, user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36') + +res = pypatent.Search('microsoft', get_patent_details=True, web_connection=conn) + +print(res) +``` + +An example using the `requests` library with the default user agent (a `WebConnection` is not necessary here because the defaults are used):
+```python +import pypatent + +res = pypatent.Search('microsoft', get_patent_details=True) + +print(res) +``` + +#### New in version 1.1: + +This version makes searching and storing patent data easier: +* Simplified to 2 objects: `Search` and `Patent` +* A `Search` object searches the USPTO site and can output the results as a DataFrame or list. It can scrape the details of each patent, or just get the patent title and URL. Most users will only need to use this object. +* A `Patent` object fetches and holds a single patent's info. Fetching the patent's details is now optional. This object should only be used when you already have the patent URL and aren't conducting a search. \ No newline at end of file diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 0000000..55b033e --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1 @@ +pytest \ No newline at end of file diff --git a/pypatent/__init__.py b/pypatent/__init__.py index f6586ff..c68d0b0 100644 --- a/pypatent/__init__.py +++ b/pypatent/__init__.py @@ -4,15 +4,44 @@ import requests import re import pandas as pd +from selenium import webdriver -class Constants: - user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' - request_header = {'user-agent': user_agent} +class WebConnection: + def __init__(self, + use_selenium: bool = False, + selenium_driver: webdriver = None, + user_agent: str = None, + request_header: dict = None): + self.use_selenium = use_selenium + self.selenium_driver = selenium_driver + + if user_agent is None: + self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' + else: + self.user_agent = user_agent + if request_header is None: + self.request_header = {'user-agent': self.user_agent} + else: + self.request_header = request_header + + def get(self, url: str): + if self.use_selenium: + if self.selenium_driver is 
None: + raise ValueError('WebConnection.selenium_driver must point to a valid Selenium webdriver') + else: + self.selenium_driver.get(url) + return self.selenium_driver.page_source + else: + return requests.get(url, headers=self.request_header).text class Patent: - def __init__(self, title: str, url: str): + def __init__(self, title: str, url: str, web_connection: WebConnection = None): + if web_connection is not None: + self.web_connection = web_connection + else: + self.web_connection = WebConnection() self.title = title self.url = url self.fetched_details = False @@ -35,7 +64,7 @@ def __init__(self, title: str, url: str): def fetch_details(self): self.fetched_details = True - r = requests.get(self.url, headers=Constants.request_header).text + r = self.web_connection.get(self.url) s = BeautifulSoup(r, 'html.parser') try: self.patent_num = s.find(string='United States Patent ').find_next().text.replace('\n', '').strip() @@ -170,9 +199,10 @@ def __repr__(self): class Search: def __init__(self, - string=None, - results_limit=50, - get_patent_details=True, + string: str = None, + results_limit: int = 50, + get_patent_details: bool = True, + web_connection: WebConnection = None, pn=None, isd=None, ttl=None, @@ -229,7 +259,11 @@ def __init__(self, ilpd=None, ilfd=None): self.get_patent_details = get_patent_details - args = {k: str(v).replace(' ', '-') for k, v in locals().items() if v and v is not self and v not in [get_patent_details, results_limit]} + if web_connection is not None: + self.web_connection = web_connection + else: + self.web_connection = WebConnection() + args = {k: str(v).replace(' ', '-') for k, v in locals().items() if v and v is not self and v not in [get_patent_details, results_limit, web_connection]} searchstring = ' AND '.join(['%s/%s' % (key, value) for (key, value) in args.items() if key not in ['results_limit']]) searchstring = searchstring.replace('string/', '') searchstring = searchstring.replace(' ', '+') @@ -242,7 +276,7 @@ def 
__init__(self, base_url = 'http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p=1&f=S&l=50&Query=' url = base_url + searchstring + '&d=PTXT' - r = requests.get(url, headers=Constants.request_header).text + r = self.web_connection.get(url) s = BeautifulSoup(r, 'html.parser') total_results = int(s.find(string=re.compile('out of')).find_next().text.strip()) @@ -272,7 +306,7 @@ def __init__(self, self.patents = patents def get_patents_from_results_url(self, url: str, limit: int = None) -> list: - r = requests.get(url, headers=Constants.request_header).text + r = self.web_connection.get(url) s = BeautifulSoup(r, 'html.parser') patents_raw = s.find_all('a', href=re.compile('netacgi')) patents_base_url = 'http://patft.uspto.gov' @@ -287,7 +321,7 @@ def get_patents_from_results_url(self, url: str, limit: int = None) -> list: patent_title = patents_raw_list[patent_num_idx + 1][0] patent_title = re.sub(' +', ' ', patent_title) patent_link = patents_raw_list[patent_num_idx][1] - p = Patent(patent_title, patent_link) + p = Patent(patent_title, patent_link, self.web_connection) if self.get_patent_details: p.fetch_details() patents.append(p) diff --git a/requirements.txt b/requirements.txt index afed453..2f10503 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ bs4 requests -pandas \ No newline at end of file +pandas +selenium \ No newline at end of file diff --git a/setup.py b/setup.py index 2a3ca03..4f44662 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ long_description = fh.read() setup(name='pypatent', - version='1.1.0', + version='1.2.0', description='Search and retrieve USPTO patent data', url='http://github.com/daneads/pypatent', author='Dan Eads', @@ -13,7 +13,7 @@ classifiers=['Development Status :: 3 - Alpha', 'Intended Audience :: Developers', 'Topic :: Internet', 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', 'Programming Language :: Python :: 3'], 
keywords=['patent', 'uspto', 'scrape', 'scraping'], packages=['pypatent'], - install_requires=['bs4', 'requests', 'pandas'], + install_requires=['bs4', 'requests', 'pandas', 'selenium'], python_requires='>=3', zip_safe=False, long_description=long_description, From 41bb003d3c22ef87551113147eaa6c791e1d7005 Mon Sep 17 00:00:00 2001 From: Dan Eads Date: Mon, 15 Jul 2019 21:37:45 -0400 Subject: [PATCH 2/2] Clearer README --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 9a7b6b3..bf7a97b 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,21 @@ pypatent is a tiny Python package to easily search for and scrape US Patent and [PyPI page](https://pypi.python.org/pypi/pypatent) +#### New in version 1.2 + +This version implements Selenium support for scraping. +Previous versions used the `requests` library for all requests; however, that approach has recently run into problems with the USPTO site. +Some users have been able to use `requests` without issue, while others get 4xx errors. + +PyPatent version 1.2 adds a new `WebConnection` object that lets the user use Selenium WebDrivers in place of the `requests` library. +This `WebConnection` object is optional. +If used, it should be passed as the `web_connection` argument when initializing `Search` or `Patent` objects. + +Use it in the following cases: +* When you want to use Selenium instead of `requests` +* When you want to use `requests` but with a custom user-agent or headers + +See the bottom of this README for examples. ## Requirements