Skip to content

Commit

Permalink
Backend: more consistent api urls, better ordering of spider pipeline…
Browse files Browse the repository at this point in the history
…s, added integration tests for pipelines and db
  • Loading branch information
lwitkowski committed Aug 3, 2024
1 parent af1fbad commit 03c1ff3
Show file tree
Hide file tree
Showing 24 changed files with 264 additions and 89 deletions.
3 changes: 2 additions & 1 deletion azure/setup_azure_infra.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ az containerapp create \
--environment $ENV_NAME \
--registry-server $ACR \
--image $ACR/aerooffers-api:$DOCKER_IMAGE_TAG \
--env-vars DB_HOST="???" DB_PORT="5432" DB_NAME="???" DB_USER="???" DB_PW="???" \
--secrets "db-user=$DB_USER" "db-password=$DB_PASS" \
--env-vars "DB_HOST=$DB_HOST" "DB_PORT=$DB_PORT" "DB_NAME=$DB_NAME" "DB_USER=secretref:db-user" "DB_PW=secretref:db-password" \
--target-port 80 \
--ingress internal \
--transport tcp \
Expand Down
14 changes: 7 additions & 7 deletions backend/api/flask_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,18 @@
app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}})

@app.route("/api/models")
@headers({'Cache-Control':'public, max-age=360'})
def aircraft_models():
    """Return every known aircraft model as JSON; response is browser-cacheable."""
    models = classifier.get_all_models()
    return jsonify(models)

@app.route('/api/offers')
def offers():
    """Return a JSON page of offers, filtered/paginated via query parameters."""
    args = request.args
    page = db.get_offers_dict(
        aircraft_type=args.get('aircraft_type'),
        offset=args.get('offset'),
        limit=args.get('limit'),
    )
    return jsonify(page)

@app.route("/api/model/<manufacturer>/<model>")
@app.route("/api/offers/<manufacturer>/<model>")
def model_information(manufacturer, model):
"""
Returns statistics for a specific manufacturer and model
Expand All @@ -26,10 +31,5 @@ def model_information(manufacturer, model):
manufacturer_info["offers"] = db.get_offers_for_model(manufacturer, model)
return jsonify(manufacturer_info)

@app.route("/api/models")
@headers({'Cache-Control':'public, max-age=360'})
def aircraft_models():
return jsonify(classifier.get_all_models())

if __name__ == '__main__':
app.run(host='127.0.0.1', port=8080, debug=True)
app.run(host='127.0.0.1', port=8080, debug=False)
29 changes: 15 additions & 14 deletions backend/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,16 @@
from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import select
from sqlalchemy.sql import text
from sqlalchemy.types import Date, DateTime, Unicode, Numeric, Integer

from my_logging import *
from settings import DB_NAME, DB_USER, DB_PW, DB_HOST, DB_PORT
from settings import DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PW

logger = logging.getLogger('db')

Base = declarative_base()


class AircraftOffer(Base):
__tablename__ = "aircraft_offer"

Expand Down Expand Up @@ -50,8 +49,7 @@ def as_dict(self):
"location": self.location,
"aircraft_type": self.aircraft_type,
"manufacturer": self.manufacturer,
"model": self.model,
"classified": self.classified
"model": self.model
}

class ExchangeRate(Base):
Expand All @@ -66,10 +64,11 @@ class ExchangeRate(Base):
engine = create_engine('postgresql+psycopg2://{0}:{1}@{2}:{3}/{4}'.format(DB_USER, DB_PW, DB_HOST, DB_PORT, DB_NAME))
Session = sessionmaker(bind=engine)


def create_tables():
    """Create all tables mapped on Base's metadata using the module-level engine."""
    Base.metadata.create_all(engine)

def truncate_offers():
    """Remove all rows from the aircraft_offer table.

    Opens a fresh session, issues a raw TRUNCATE, and commits.
    """
    session = Session()
    try:
        session.execute(text("TRUNCATE aircraft_offer"))
        session.commit()
    finally:
        # Always release the session back to the pool, even if TRUNCATE or
        # commit raises — the original leaked the session on failure.
        session.close()

def store_entity(entity):
session = Session()
Expand All @@ -91,16 +90,18 @@ def update_exchange_rate(exchange_rate):
session.commit()


def has_offer_url(offer_url):
def offer_url_exists(offer_url):
session = Session()
try:
query = select(select(AircraftOffer.offer_url).where(AircraftOffer.offer_url == offer_url).exists())
conn = engine.connect()
result = conn.execute(query).fetchone()
return result is not None and result[0] == True
q = session.query(AircraftOffer).where(AircraftOffer.offer_url == offer_url).exists()
result = session.query(q).one()
return result is not None and result[0]
except Exception as e:
logger.error(e)
logger.error("database error, assuming we don't have this offer already")
return False
finally:
session.close()

def get_exchange_rates_as_dict(session):
all_exchange_rates = session.query(ExchangeRate).all()
Expand Down
59 changes: 31 additions & 28 deletions backend/pipelines.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,42 @@
import datetime

from scrapy.exceptions import DropItem
from price_parser import Price

import db
from my_logging import *
from spiders import SoaringDeSpider, FlugzeugMarktDeSpider, PlaneCheckComSpider
from exchange_rates import get_currency_code

logger = logging.getLogger('pipeline')


class DuplicateDetection(object):

def process_item(self, item, spider):
logger.debug("Detecting duplicates for item %s", str(item))
if spider.name in [SoaringDeSpider.SoaringDeSpider.name,
FlugzeugMarktDeSpider.FlugzeugMarktDeSpider.name,
PlaneCheckComSpider.PlaneCheckComSpider.name]:
has_offer = db.has_offer_url(item["offer_url"])
if has_offer:
logger.debug("Offer URL matches, Offer is already stored. dropping item")
raise DropItem("Offer already stored")
elif item['price'].amount is None:
raise DropItem("Offer has no price")
else:
logger.warning("Can't handle this spider for duplicate detection: %s", spider.name)
def process_item(self, item, _):
if db.offer_url_exists(item["offer_url"]):
logger.debug("Offer already exists in DB, url={0}".format(item["offer_url"]))
raise DropItem("Offer already exists in DB, url={0}".format(item["offer_url"]))
return item


class FilterUnreasonablePrices(object):

def process_item(self, item, _):
logger.debug("Filtering Prices of 1 and below")
if item["price"] and item["price"].amount <= 1:
raise DropItem("Offer has price of 1 (or below)")
price = Price.fromstring(item['raw_price'])
if price is None or price.amount is None:
msg = "Offer has no valid price, raw_price='{0}' url={1}".format(item['raw_price'].strip(), item["offer_url"])
logger.info(msg)
raise DropItem(msg)

if price.amount <= 1:
msg = "Offer has unreasonable price smaller than 1, price={0}, url={1}".format(price.amount_text, item["offer_url"])
logger.info(msg)
raise DropItem(msg)

if price.amount > 500_000:
msg = "Offer has unreasonable price higher than 500_000, price={0}, url={1} ".format(price.amount_text, item["offer_url"])
logger.info(msg)
raise DropItem(msg)

item['price'] = price
return item


Expand All @@ -42,26 +45,26 @@ class FilterSearchAndCharterOffers(object):
charter_offer_terms = ["charter", "for rent"]

def process_item(self, item, _):
logger.debug("Filtering Searches for Aircraft Offers")
for search_offer_term in self.search_offer_terms:
if search_offer_term in item["title"].lower():
logger.info("dropping search offer: " + str(item["title"]))
logger.info("Dropping search offer, title='%s' url=%s", item["title"], item["offer_url"])
raise DropItem("Dropping Search offer")
for charter_term in self.charter_offer_terms:
if charter_term in item["title"].lower():
logger.info("dropping charter offer: " + str(item["title"]))
logger.info("Dropping charter offer, title='%s' url=%s", item["title"], item["offer_url"])
raise DropItem("Dropping Charter Offer")
return item


class StoragePipeline(object):

def process_item(self, item, spider):
spider.crawler.stats.inc_value('items_stored')
logger.debug("Storing offer %s", str(item))
logging.debug("Fetching currency code")
def process_item(self, item, spider=None):
if spider is not None:
spider.crawler.stats.inc_value('items_stored')

currency_code = get_currency_code(item["price"])
logging.debug("currency code is {0}".format(currency_code))
logger.debug("Storing offer title='%s', url=%s, currency_code=%s", item["title"], item["offer_url"], currency_code)

db.store_offer(db.AircraftOffer(
title=item["title"],
creation_datetime=datetime.datetime.now(),
Expand All @@ -71,7 +74,7 @@ def process_item(self, item, spider):
currency_code=currency_code,
location=item["location"],
offer_url=item["offer_url"],
spider=spider.name,
spider=spider.name if spider is not None else "unknown",
hours=item["hours"],
starts=item["starts"],
detail_text=item["detail_text"],
Expand Down
5 changes: 1 addition & 4 deletions backend/run_tests.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
export PYTHONPATH=$PYTHONPATH':./'

python3 -m unittest -f
python3 -m unittest -v

if [[ $? -ne 0 ]]; then
docker rm -f test-db
exit 1
else
docker rm -f test-db
fi
6 changes: 3 additions & 3 deletions backend/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

# scrapy pipeline components config, do not delete this
ITEM_PIPELINES = {
'pipelines.DuplicateDetection': 100,
'pipelines.FilterUnreasonablePrices': 200,
'pipelines.FilterSearchAndCharterOffers': 300,
'pipelines.FilterUnreasonablePrices': 100,
'pipelines.FilterSearchAndCharterOffers': 200,
'pipelines.DuplicateDetection': 300,
'pipelines.StoragePipeline': 400,
}
6 changes: 2 additions & 4 deletions backend/spiders/FlugzeugMarktDeSpider.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import scrapy
import datetime
import re
from price_parser import Price
from my_logging import *

BASE_URL = "https://www.flugzeugmarkt.de/"
Expand Down Expand Up @@ -48,7 +47,6 @@ def _extract_number_from_cell(self, name, response):
def parse_detail_page(self, response):
date = response.xpath("//tr/td[contains(.,'Eingestellt')]/../td[@class='value']/text()").extract_first()
price_str = response.css('div.buy-it-now div.price::text').extract_first()
parsed_price = Price.fromstring(price_str)
location = response.xpath("//tr/td[contains(.,'Standort')]/../td[@class='value']/text()").extract_first()
hours = self._extract_number_from_cell("Gesamtzeit", response)
starts = self._extract_number_from_cell("Landungen", response)
Expand All @@ -63,10 +61,10 @@ def parse_detail_page(self, response):
self.logger.info(
"Couldn't determine aircraft type for offer: {0} with url: {1}".format(title, response.url))
self.logger.debug("yielding title %s", title)
yield {
yield { # TODO introduce data class
'title': title,
'date': datetime.datetime.strptime(date, "%d.%m.%Y").date(),
'price': parsed_price,
'raw_price': price_str,
'offer_url': response.url,
'location': location,
'aircraft_type': aircraft_type,
Expand Down
6 changes: 2 additions & 4 deletions backend/spiders/PlaneCheckComSpider.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import scrapy
import datetime
from price_parser import Price
from my_logging import *

BASE_URL = "https://www.planecheck.com/"
Expand Down Expand Up @@ -38,14 +37,13 @@ def parse_detail_page(self, response):
logging.info("price with VAT should be: {0}".format(price_str))
else:
price_str = response.xpath("//td[contains(.,'Price')]/../td[2][contains(.,',')]/b/text()").extract_first()
parsed_price = Price.fromstring(price_str)
location = response.xpath("//td/b[contains(.,'Country')]/../../td[2]/text()").extract_first()
yield {
yield { # TODO introduce data class
'offer_url': response.url,
'title': title,
'aircraft_type': 'airplane',
'date': datetime.datetime.strptime(date, "%d-%m-%Y").date(), # last updated value
'price': parsed_price,
'raw_price': price_str,
'detail_text': response.text,
'location': location, # TODO currently only the country is extracted,
'hours': -1,
Expand Down
6 changes: 2 additions & 4 deletions backend/spiders/SoaringDeSpider.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from scrapy.spidermiddlewares.httperror import HttpError
import datetime
import re
from price_parser import Price
from my_logging import *

GLIDER_OFFERS_URL = "https://soaring.de/osclass/index.php?page=search&sCategory=118"
Expand Down Expand Up @@ -52,7 +51,6 @@ def errback(self, failure):

def parse_detail_page(self, response):
price_str = response.css('#item-content .item-header li::text').extract()[1]
parsed_price = Price.fromstring(price_str)
date_str = response.css('#item-content .item-header li::text').extract()[3]
date_str = date_str.replace('Veröffentlichungsdatum:', '').strip()
date_obj = datetime.datetime.strptime(date_str, "%d/%m/%Y").date()
Expand All @@ -68,9 +66,9 @@ def parse_detail_page(self, response):
if 'Gesamtstarts' in aircraft_details:
starts = self._extract_first_number(aircraft_details)

yield {
yield { # TODO introduce data class
'title': response.css('#item-content .title strong::text').extract_first(),
'price': parsed_price,
'raw_price': price_str,
'offer_url': response.url,
'location': location,
'date': date_obj,
Expand Down
5 changes: 4 additions & 1 deletion backend/tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
ddt==1.7.2
ddt==1.7.2
psycopg==3.2.1
pytest==8.3.2
testcontainers[postgres]==4.7.2
2 changes: 1 addition & 1 deletion backend/tests/test_FlugzeugMarktDeSpider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_parse_detail_page(self):
fake_response_from_file('samples/flugzeugmarkt_de_offer.html')))
self.assertIsNotNone(item["title"])
self.assertEqual(item["date"], datetime.datetime.strptime("08.10.2019", "%d.%m.%Y").date())
self.assertIsNotNone(item["price"])
self.assertEqual("250.000 $", item["raw_price"])
self.assertEqual(1492, item["hours"])
self.assertTrue("IFR Approved" in item["detail_text"])
self.assertEqual("airplane", item["aircraft_type"])
7 changes: 2 additions & 5 deletions backend/tests/test_PlaneCheckComSpider.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ def test_parse_detail_page(self):
self.assertIsNotNone(item["title"])
self.assertEqual("Beech 95 Travel Air D95A", item["title"])
self.assertEqual(item["date"], datetime.datetime.strptime("31.12.2019", "%d.%m.%Y").date())
self.assertIsNotNone(item["price"])
self.assertEqual("92,500", item["price"].amount_text)
self.assertEqual("€", item["price"].currency)
self.assertEqual(item["raw_price"], "€\xa092,500")
self.assertTrue(len(item["detail_text"]) > 0)
self.assertTrue("Switzerland" in item["location"])
self.assertTrue(len(item["offer_url"]) > 0)
Expand All @@ -34,5 +32,4 @@ def test_parse_detail_page_price_vat_included(self):
fake_response_from_file('samples/planecheck_com_offer_piper.html', encoding='iso-8859-1')))
self.assertIsNotNone(item["title"])
self.assertEqual("Piper PA-34-220T Seneca V", item["title"])
self.assertEqual(item["price"].currency, "$")
self.assertEqual(item["price"].amount_text, "743,750")
self.assertIsNotNone(item["raw_price"])
5 changes: 2 additions & 3 deletions backend/tests/test_SoaringDeSpider.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_parse_detail_page(self):
self.assertIsNotNone(item["date"])
self.assertTrue(isinstance(item["date"], date))
self.assertIsNotNone(item["title"])
self.assertIsNotNone(item["price"])
self.assertEqual("25.000,00 Euro € ", item["raw_price"])
self.assertIsNotNone(item["offer_url"])
self.assertIsNotNone(item["location"])
self.assertEqual(item["hours"], str(2522))
Expand All @@ -35,8 +35,7 @@ def test_parse_detail_page_for_tmg(self):
self.assertEqual("Dimona H36", item["title"])
self.assertEqual("2880", item["hours"])
self.assertEqual("5672", item["starts"])
self.assertEqual("22.000,00", item["price"].amount_text)
self.assertEqual("€", item["price"].currency)
self.assertEqual("22.000,00 Euro €\n ", item["raw_price"])

def test_parse_detail_page_for_ls3(self):
item = next(self.spider.parse_detail_page(
Expand Down
17 changes: 17 additions & 0 deletions backend/tests/test__testcontainers_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# This module's name (with the double underscore) makes the python unittest runner
# execute it BEFORE the db module (or any other module depending on 'db');
# otherwise the db module will NOT connect properly to the Postgres instance
# initialized here.
import os

from pathlib import Path
from testcontainers.postgres import PostgresContainer

# Mount the repository's SQL migration scripts into Postgres' init directory so
# the schema is created automatically when the container starts.
db_migration_scripts_location = Path(__file__).parent.parent.parent / "db" / "migrations"
postgres = PostgresContainer("postgres:16-alpine")
# NOTE: was f"/docker-entrypoint-initdb.d/" — the f-prefix had no placeholder (ruff F541).
postgres.with_volume_mapping(host=str(db_migration_scripts_location), container="/docker-entrypoint-initdb.d/")
# The container is intentionally left running for the whole test session;
# testcontainers' Ryuk sidecar reaps it when the process exits.
postgres.start()

# Publish connection parameters as env vars — presumably read by settings.py on
# import (TODO confirm) — so db.py connects to this throwaway instance.
os.environ["DB_HOST"] = postgres.get_container_host_ip()
os.environ["DB_PORT"] = postgres.get_exposed_port(5432)
os.environ["DB_NAME"] = postgres.dbname
os.environ["DB_USER"] = postgres.username
os.environ["DB_PW"] = postgres.password
Loading

0 comments on commit 03c1ff3

Please sign in to comment.