From 8af0d85bd52b9a307970dc88df345f1f7e963a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Witkowski?= Date: Fri, 2 Aug 2024 13:00:52 +0200 Subject: [PATCH] Enable FlugzeugMarktDeSpider (#17) Also: optimised checking of duplicates (if offer is already stored in db) by using exists (~0.1ms) instead of plain select (~4ms) --- backend/db.py | 8 +++----- backend/job_fetch_offers.py | 2 +- backend/spiders/FlugzeugMarktDeSpider.py | 13 ++++++++++++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/backend/db.py b/backend/db.py index e30624e..90bf920 100644 --- a/backend/db.py +++ b/backend/db.py @@ -92,13 +92,11 @@ def update_exchange_rate(exchange_rate): def has_offer_url(offer_url): - logger.debug("Starting new database connection") try: - s = select(AircraftOffer.offer_url).where(AircraftOffer.offer_url == offer_url) + query = select(select(AircraftOffer.offer_url).where(AircraftOffer.offer_url == offer_url).exists()) conn = engine.connect() - if conn.execute(s).fetchone(): - return True - return False + result = conn.execute(query).fetchone() + return result is not None and result[0] == True except Exception as e: logger.error(e) logger.error("database error, assuming we don't have this offer already") diff --git a/backend/job_fetch_offers.py b/backend/job_fetch_offers.py index 5ba93e0..7cde5c0 100644 --- a/backend/job_fetch_offers.py +++ b/backend/job_fetch_offers.py @@ -17,7 +17,7 @@ spiders = { SoaringDeSpider.SoaringDeSpider: None, - #FlugzeugMarktDeSpider.FlugzeugMarktDeSpider: None, + FlugzeugMarktDeSpider.FlugzeugMarktDeSpider: None, #PlaneCheckComSpider.PlaneCheckComSpider: None } for spider_cls in spiders.keys(): diff --git a/backend/spiders/FlugzeugMarktDeSpider.py b/backend/spiders/FlugzeugMarktDeSpider.py index d4e9884..f105bca 100644 --- a/backend/spiders/FlugzeugMarktDeSpider.py +++ b/backend/spiders/FlugzeugMarktDeSpider.py @@ -11,7 +11,18 @@ class FlugzeugMarktDeSpider(scrapy.Spider): name = "flugzeugmarkt_de" logger = logging.getLogger(name) - start_urls = [AIRCRAFT_OFFERS_URL] + start_urls = [ + AIRCRAFT_OFFERS_URL, + AIRCRAFT_OFFERS_URL + "/seite-2.html", + AIRCRAFT_OFFERS_URL + "/seite-3.html", + AIRCRAFT_OFFERS_URL + "/seite-4.html", + AIRCRAFT_OFFERS_URL + "/seite-5.html", + AIRCRAFT_OFFERS_URL + "/seite-6.html", + AIRCRAFT_OFFERS_URL + "/seite-7.html", + AIRCRAFT_OFFERS_URL + "/seite-8.html", + AIRCRAFT_OFFERS_URL + "/seite-9.html", + AIRCRAFT_OFFERS_URL + "/seite-10.html" + ] aircraft_type_mapping = { "Motorsegler": "tmg",