Merge pull request #9 from daneads/develop

1.2.0 - Selenium support

daneads authored Jul 16, 2019
2 parents e6072ee + 41bb003 commit fee12cc
Showing 6 changed files with 129 additions and 21 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -214,3 +214,6 @@ pip-log.txt

#Mr Developer
.mr.developer.cfg

IntelliJ
.idea
81 changes: 75 additions & 6 deletions README.md
@@ -4,23 +4,36 @@ pypatent is a tiny Python package to easily search for and scrape US Patent and

[PyPI page](https://pypi.python.org/pypi/pypatent)

*New in version 1.1:*
#### New in version 1.2

This version makes searching and storing patent data easier:
* Simplified to 2 objects: `Search` and `Patent`
* A `Search` object searches the USPTO site and can output the results as a DataFrame or list. It can scrape the details of each patent, or just get the patent title and URL. Most users will only need to use this object.
* A `Patent` object fetches and holds a single patent's info. Fetching the patent's details is now optional. This object should only be used when you already have the patent URL and aren't conducting a search.
This version implements Selenium support for scraping.
Previous versions used the `requests` library for all requests; however, that library has had problems with the USPTO site lately.
I've noticed that some users can use `requests` without issue, while others get 4xx errors.

PyPatent Version 1.2 implements an optional new WebConnection object that lets you use a Selenium WebDriver in place of the `requests` library.
If used, it should be passed as an argument when initializing `Search` or `Patent` objects.

Use it in the following cases:
* When you want to use Selenium instead of `requests`
* When you want to use `requests` but with a custom user-agent or headers

See bottom of README for examples.

## Requirements

Python 3, BeautifulSoup, requests, pandas, re
Python 3, BeautifulSoup, requests, pandas, re, selenium

## Installation

```
pip install pypatent
```

If using Selenium for scraping (introduced in version 1.2), be sure to install a Selenium WebDriver.
For Chrome, use `chromedriver`. For Firefox, use `geckodriver`.
See [the Selenium download page](https://www.seleniumhq.org/download/) for more details and options.
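Before handing a driver to Selenium, it can help to confirm the executable is actually discoverable. A minimal standard-library sketch (`find_selenium_driver` is a hypothetical helper for illustration, not part of pypatent):

```python
import shutil

def find_selenium_driver(names=("chromedriver", "geckodriver")):
    """Return the path of the first Selenium driver executable found on PATH, or None."""
    for name in names:
        path = shutil.which(name)
        if path:
            return path
    return None

print(find_selenium_driver() or "no driver found on PATH")
```

If this prints "no driver found on PATH", install a driver for your browser and make sure its directory is on your PATH before constructing the WebDriver.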

## Searching for patents

The Search object works similarly to the [Advanced Search at the USPTO](http://patft.uspto.gov/netahtml/PTO/search-adv.htm), with additional options.
@@ -191,3 +204,59 @@ this_patent.fetch_details()
* ILRD: International Registration Date
* ILPD: International Registration Publication Date
* ILFD: Hague International Filing Date

### Changelog
#### New in version 1.2

This version implements Selenium support for scraping.
Previous versions used the `requests` library for all requests; however, the USPTO site has been causing problems for it.
I've noticed that some users can use `requests` without issue, while others get 4xx errors.

PyPatent Version 1.2 implements an optional new WebConnection object that lets you use a Selenium WebDriver in place of the `requests` library.
If used, it should be passed as an argument when initializing `Search` or `Patent` objects.
Use it in the following cases:
* When you want to use Selenium instead of `requests`
* When you want to use `requests` but with a custom user-agent or headers

An example using the Firefox WebDriver:

```python
import pypatent
from selenium import webdriver

driver = webdriver.Firefox() # Requires geckodriver in your PATH

conn = pypatent.WebConnection(use_selenium=True, selenium_driver=driver)

res = pypatent.Search('microsoft', get_patent_details=True, web_connection=conn)

print(res)
```

An example using the `requests` library with a custom user agent:
```python
import pypatent

conn = pypatent.WebConnection(use_selenium=False, user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36')

res = pypatent.Search('microsoft', get_patent_details=True, web_connection=conn)

print(res)
```

An example using the `requests` library with the default user agent (WebConnection is not necessary here, as we are using the defaults):
```python
import pypatent

res = pypatent.Search('microsoft', get_patent_details=True)

print(res)
```

#### New in version 1.1:

This version makes searching and storing patent data easier:
* Simplified to 2 objects: `Search` and `Patent`
* A `Search` object searches the USPTO site and can output the results as a DataFrame or list. It can scrape the details of each patent, or just get the patent title and URL. Most users will only need to use this object.
* A `Patent` object fetches and holds a single patent's info. Fetching the patent's details is now optional. This object should only be used when you already have the patent URL and aren't conducting a search.
1 change: 1 addition & 0 deletions dev-requirements.txt
@@ -0,0 +1 @@
pytest
58 changes: 46 additions & 12 deletions pypatent/__init__.py
@@ -4,15 +4,44 @@
import requests
import re
import pandas as pd
from selenium import webdriver


class Constants:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
request_header = {'user-agent': user_agent}
class WebConnection:
def __init__(self,
use_selenium: bool = False,
selenium_driver: webdriver = None,
user_agent: str = None,
request_header: dict = None):
self.use_selenium = use_selenium
self.selenium_driver = selenium_driver

if user_agent is None:
self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
else:
self.user_agent = user_agent
if request_header is None:
self.request_header = {'user-agent': self.user_agent}
else:
self.request_header = request_header

def get(self, url: str):
if self.use_selenium:
if self.selenium_driver is None:
raise ValueError('WebConnection.selenium_driver must point to a valid Selenium webdriver')
else:
self.selenium_driver.get(url)
return self.selenium_driver.page_source
else:
return requests.get(url, headers=self.request_header).text


class Patent:
def __init__(self, title: str, url: str):
def __init__(self, title: str, url: str, web_connection: WebConnection = None):
if web_connection is not None:
self.web_connection = web_connection
else:
self.web_connection = WebConnection()
self.title = title
self.url = url
self.fetched_details = False
@@ -35,7 +64,7 @@ def __init__(self, title: str, url: str):

def fetch_details(self):
self.fetched_details = True
r = requests.get(self.url, headers=Constants.request_header).text
r = self.web_connection.get(self.url)
s = BeautifulSoup(r, 'html.parser')
try:
self.patent_num = s.find(string='United States Patent ').find_next().text.replace('\n', '').strip()
@@ -170,9 +199,10 @@ def __repr__(self):

class Search:
def __init__(self,
string=None,
results_limit=50,
get_patent_details=True,
string: str = None,
results_limit: int = 50,
get_patent_details: bool = True,
web_connection: WebConnection = None,
pn=None,
isd=None,
ttl=None,
@@ -229,7 +259,11 @@ def __init__(self,
ilpd=None,
ilfd=None):
self.get_patent_details = get_patent_details
args = {k: str(v).replace(' ', '-') for k, v in locals().items() if v and v is not self and v not in [get_patent_details, results_limit]}
if web_connection is not None:
self.web_connection = web_connection
else:
self.web_connection = WebConnection()
args = {k: str(v).replace(' ', '-') for k, v in locals().items() if v and v is not self and v not in [get_patent_details, results_limit, web_connection]}
searchstring = ' AND '.join(['%s/%s' % (key, value) for (key, value) in args.items() if key not in ['results_limit']])
searchstring = searchstring.replace('string/', '')
searchstring = searchstring.replace(' ', '+')
@@ -242,7 +276,7 @@ def __init__(self,
base_url = 'http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p=1&f=S&l=50&Query='

url = base_url + searchstring + '&d=PTXT'
r = requests.get(url, headers=Constants.request_header).text
r = self.web_connection.get(url)
s = BeautifulSoup(r, 'html.parser')
total_results = int(s.find(string=re.compile('out of')).find_next().text.strip())

@@ -272,7 +306,7 @@ def __init__(self,
self.patents = patents

def get_patents_from_results_url(self, url: str, limit: int = None) -> list:
r = requests.get(url, headers=Constants.request_header).text
r = self.web_connection.get(url)
s = BeautifulSoup(r, 'html.parser')
patents_raw = s.find_all('a', href=re.compile('netacgi'))
patents_base_url = 'http://patft.uspto.gov'
@@ -287,7 +321,7 @@ def get_patents_from_results_url(self, url: str, limit: int = None) -> list:
patent_title = patents_raw_list[patent_num_idx + 1][0]
patent_title = re.sub(' +', ' ', patent_title)
patent_link = patents_raw_list[patent_num_idx][1]
p = Patent(patent_title, patent_link)
p = Patent(patent_title, patent_link, self.web_connection)
if self.get_patent_details:
p.fetch_details()
patents.append(p)
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,3 +1,4 @@
bs4
requests
pandas
pandas
selenium
4 changes: 2 additions & 2 deletions setup.py
@@ -4,7 +4,7 @@
long_description = fh.read()

setup(name='pypatent',
version='1.1.0',
version='1.2.0',
description='Search and retrieve USPTO patent data',
url='http://github.com/daneads/pypatent',
author='Dan Eads',
@@ -13,7 +13,7 @@
classifiers=['Development Status :: 3 - Alpha', 'Intended Audience :: Developers', 'Topic :: Internet', 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', 'Programming Language :: Python :: 3'],
keywords=['patent', 'uspto', 'scrape', 'scraping'],
packages=['pypatent'],
install_requires=['bs4', 'requests', 'pandas'],
install_requires=['bs4', 'requests', 'pandas', 'selenium'],
python_requires='>=3',
zip_safe=False,
long_description=long_description,
