From 0f7e526f6d1211bdc24f95fd8ac266771c94f3fb Mon Sep 17 00:00:00 2001 From: Kaya Celebi Date: Fri, 26 May 2023 02:35:09 -0400 Subject: [PATCH] Updates to scrape protocol --- README.md | 2 +- src/google_flight_analysis/flight.py | 6 +- src/google_flight_analysis/scrape.py | 129 +++++++++++++++------------ tests/test_class.py | 8 +- 4 files changed, 82 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index 1baa1eb..78a0317 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![kcelebi](https://circleci.com/gh/celebi-pkg/flight-analysis.svg?style=svg)](https://circleci.com/gh/celebi-pkg/flight-analysis) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Live on PyPI](https://img.shields.io/badge/PyPI-1.1.0-brightgreen)](https://pypi.org/project/google-flight-analysis/) -[![Live on PyPI](https://img.shields.io/badge/PyPI-1.1.1--alpha.3-brightgreen)](https://test.pypi.org/project/google-flight-analysis/1.1.1a3/) +[![TestPyPI](https://img.shields.io/badge/PyPI-1.1.1--alpha.3-blue)](https://test.pypi.org/project/google-flight-analysis/1.1.1a3/) # Flight Analysis diff --git a/src/google_flight_analysis/flight.py b/src/google_flight_analysis/flight.py index d4d0db1..6bef828 100644 --- a/src/google_flight_analysis/flight.py +++ b/src/google_flight_analysis/flight.py @@ -8,12 +8,12 @@ class Flight: - def __init__(self, dl, *args): + def __init__(self, date, *args): self._id = 1 self._origin = None self._dest = None - self._date = dl - self._dow = datetime.strptime(dl, '%Y-%m-%d').isoweekday() # day of week + self._date = date + self._dow = datetime.strptime(date, '%Y-%m-%d').isoweekday() # day of week self._airline = None self._flight_time = None self._num_stops = None diff --git a/src/google_flight_analysis/scrape.py b/src/google_flight_analysis/scrape.py index f8f7101..1ce14f6 100644 --- a/src/google_flight_analysis/scrape.py +++ b/src/google_flight_analysis/scrape.py @@ -48,21 +48,15 @@ def __init__(self): self._date = None self._data = pd.DataFrame() self._url = None + self._type = None # if date leave and date return, return 2 objects? def __call__(self, *args): - if _#len(args) <= 4: - # base call protocol - self._set_properties(*args) - obj = self.clone(*args) - obj.data = self._data - return obj - else: - # data file being added to new scrape - self._set_properties(*(args[:-1])) - obj = self.clone(*(args[:-1])) - obj.data = args[-1] - return obj + # base call protocol + self._set_properties(*args) + obj = self.clone(*args) + obj.data = self._data + return obj # Ability to combine a going and return trip @@ -134,6 +128,10 @@ def _set_properties(self, *args): self._origin, self._dest, self._date = args + assert len(self._origin) == len(self._dest) == len(self._date), "Issue with array lengths, talk to dev" + self._url = self._make_url() + self._type = 'one-way' + # round-trip elif len(args) == 4: assert len(args[0]) == 3 and type(args[0]) == str, "Issue with arg 0, see docs" @@ -143,18 +141,34 @@ def _set_properties(self, *args): self._origin, self._dest, self._date = args[:2] + (args[2:],) + assert len(self._origin) == len(self._dest) == len(self._date), "Issue with array lengths, talk to dev" + self._url = self._make_url() + self._type = 'round-trip' + # chain-trip - elif len(args) >= 4 and len(args) % 2 == 1: - + elif len(args) >= 3 and len(args) % 3 == 0: + self._origin, self._dest, self._date = [], [], [] + + for i in range(0, len(args), 3): + assert len(args[i]) == 3 and type(args[i]) == str, "Issue with arg {}, see docs".format(i) + assert len(args[i + 1]) == 3 and type(args[i+1]) == str, "Issue with arg {}, see docs".format(i+1) + assert len(args[i + 2] == 10 and type(args[i + 2])) == str, "Issue with arg {}, see docs".format(i+2) + + self._origin += [args[i]] + self._dest += [args[i + 1]] + self._date += [args[i + 2]] + + assert len(self._origin) == len(self._dest) == len(self._date), "Issue with array lengths, talk to dev" + self._url = self._make_url() + self._type = 'chain-trip' + # perfect-chain - elif _: + elif len(args) >= 4 and len(args) % 2 == 1: assert len(args[0]) == 3 and type(args[0]) == str, "Issue with arg 0, see docs" assert len(args[1]) == 10 and type(args[1]) == str, "Issue with arg 1, see docs" - self._origin = [args[0]] - self._dest = [] - self._date = [args[1]] + self._origin, self._dest, self._date = [args[0]], [], [args[1]] for i in range(2, len(args)-1, 2): assert len(args[i]) == 3 and type(args[i]) == str, "Issue with arg {}, see docs".format(i) @@ -167,8 +181,14 @@ def _set_properties(self, *args): assert len(args[-1]) == 3 and type(args[-1]) == str, "Issue with last arg, see docs" self._dest += [args[-1]] + assert len(self._origin) == len(self._dest) == len(self._date), "Issue with array lengths, talk to dev" + self._url = self._make_url() + self._type = 'perfect-chain' + + + else: - raise Error() + raise NotImplementedError() '''( self._origin, self._dest, self._date_leave, self._date_return @@ -185,6 +205,7 @@ def origin(self): @origin.setter def origin(self, x : str) -> None: + assert self._data.shape[0] == 0, "Can't set origin after query has been completed." self._origin = x @property @@ -193,23 +214,17 @@ def dest(self): @dest.setter def dest(self, x : str) -> None: + assert self._data.shape[0] == 0, "Can't set destination after query has been completed." self._dest = x @property - def date_leave(self): - return self._date_leave + def date(self): + return self.date - @date_leave.setter - def date_leave(self, x : str) -> None: - self._date_leave = x - - @property - def date_return(self): - return self._date_return - - @date_return.setter - def date_return(self, x : str) -> None: - self._date_return = x + @date.setter + def date(self, x : str) -> None: + assert self._data.shape[0] == 0, "Can't set date after query has been completed." + self.date = x @property def data(self): @@ -223,41 +238,44 @@ def data(self, x): def url(self): return self._url - @url.setter - def url(self, x): - self._url = x - + @property + def type(self): + return self._type + ''' Scrape the object. Add support for multiple queries, iterative. ''' def _scrape_data(self, driver): + + results = [self._get_results(url, self._data[i], driver) for i, url in enumerate(self._url)] + + self._data = pd.concat(result) - if self._date_return is not None: + '''if self._date_return is not None: leave_result = self._get_results(self._url[0], driver) return_result = self._get_results(self._url[1], driver) self._data = pd.concat([leave_result, return_result], ignore_index = True) return leave_result = self._get_results(self._url, driver) - self._data = leave_result + self._data = leave_result''' - def _make_url(self, leave = True): - if leave: - return 'https://www.google.com/travel/flights?q=Flights%20to%20{dest}%20from%20{org}%20on%20{date}%20oneway'.format( - dest = self._dest, - org = self._origin, - date = self._date_leave - ) - else: - return 'https://www.google.com/travel/flights?q=Flights%20to%20{org}%20from%20{dest}%20on%20{date}%20oneway'.format( - dest = self._dest, - org = self._origin, - date = self._date_return - ) + def _make_url(self): + urls = [] + for i in range(len(self._date)): + urls += [ + 'https://www.google.com/travel/flights?q=Flights%20to%20{org}%20from%20{dest}%20on%20{date}%20oneway'.format( + dest = self._dest[i], + org = self._origin[i], + date = self._date[i] + ) + ] + return urls - def _get_results(self, url, driver): + @staticmethod + def _get_results(url, date, driver): results = None try: results = _Scrape._make_url_request(url, driver) @@ -268,10 +286,11 @@ def _get_results(self, url, driver): ) return -1 - flights = self._clean_results(results) + flights = _Scrape._clean_results(results, date) return Flight.dataframe(flights) - def _clean_results(self, result): + @staticmethod + def _clean_results(result, date): res2 = [x.encode("ascii", "ignore").decode().strip() for x in result] start = res2.index("Sort by:")+1 @@ -286,7 +305,7 @@ def _clean_results(self, result): res3 = res2[start:mid_start] + res2[mid_end:end] matches = [i for i, x in enumerate(res3) if len(x) > 2 and ((x[-2] != '+' and (x.endswith('PM') or x.endswith('AM'))) or x[-2] == '+')][::2] - flights = [Flight(self._date_leave, res3[matches[i]:matches[i+1]]) for i in range(len(matches)-1)] + flights = [Flight(date, res3[matches[i]:matches[i+1]]) for i in range(len(matches)-1)] return flights diff --git a/tests/test_class.py b/tests/test_class.py index 6d6e74a..b34d9f4 100644 --- a/tests/test_class.py +++ b/tests/test_class.py @@ -3,8 +3,8 @@ from pathlib import Path import os -from src.google_flight_analysis.scrape import * -from src.google_flight_analysis.cache import * +from google_flight_analysis.scrape import * +from google_flight_analysis.cache import * ''' Create resilience test: run the code 3 times and check DBs the same @@ -14,10 +14,10 @@ def func_0(): return True res1 = pd.read_csv('tests/test_data/test1.csv') -res1 = Scrape("LGA", "RDU", "2023-05-15", "2023-06-15", res1) +res1 = Scrape("LGA", "RDU", "2023-05-15", "2023-06-15") res2 = pd.read_csv('tests/test_data/test2.csv') -res2 = Scrape("IST", "CDG", "2023-07-15", "2023-07-20", res2) +res2 = Scrape("IST", "CDG", "2023-07-15", "2023-07-20",) os.system('rm tests/test_data/LGA-RDU.csv') os.system('rm tests/test_data/CDG-IST.csv')