Skip to content
This repository has been archived by the owner on Jan 25, 2024. It is now read-only.

Commit

Permalink
♻️ refact(multithread): Create copy of chromedriver for each thread (#…
Browse files Browse the repository at this point in the history
…115)

This fixes an issue on Docker with "text file busy". I also added a Docker test that can be run from the command line:

```bash
docker build -t ecoindex-scrap:114 . && docker run -it --rm -v /tmp/ecoindex-cli:/tmp/ecoindex-cli ecoindex-scrap:114 python tests/docker.py
```
  • Loading branch information
vvatelot authored Jun 19, 2023
1 parent 24f4d4b commit de05595
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 32 deletions.
46 changes: 16 additions & 30 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,47 +1,33 @@
# Build image
FROM python:3.11-slim as requirements-stage
FROM python:3.11-slim

ARG CHROME_VERSION_MAIN=111
ENV CHROME_VERSION_MAIN=${CHROME_VERSION_MAIN}
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver

WORKDIR /tmp
COPY ./ ./

# Install required deps
RUN apt update && apt install -y unzip wget
RUN pip install poetry
RUN poetry build

# Build requirements.txt file
COPY ./pyproject.toml ./poetry.lock /tmp/
RUN poetry export --output=requirements.txt --without-hashes

# Download chromedriver and chrome
RUN wget "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_VERSION_MAIN}" -O /tmp/chrome_version
RUN wget "https://chromedriver.storage.googleapis.com/$(cat /tmp/chrome_version)/chromedriver_linux64.zip" \
&& unzip -o chromedriver_linux64.zip
&& unzip -o chromedriver_linux64.zip && rm chromedriver_linux64.zip \
&& mv chromedriver ${CHROMEDRIVER_PATH} \
&& chmod +x ${CHROMEDRIVER_PATH}
RUN wget "https://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_$(cat /tmp/chrome_version)-1_amd64.deb" \
-O google-chrome-stable.deb


# Main image
FROM python:3.11-slim

ARG CHROME_VERSION_MAIN=111
ENV CHROME_VERSION_MAIN=${CHROME_VERSION_MAIN}
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver
RUN apt update && \
apt -y install libpq-dev gcc /tmp/google-chrome-stable.deb && \
rm -rf /var/lib/apt/lists/* && \
rm /tmp/google-chrome-stable.deb

WORKDIR /code
ENV PYTHONPATH "/code"

# Copy requirements.txt, chromedriver, chrome_version, google-chrome-stable.deb from requirements-stage
COPY --from=requirements-stage /tmp/ /tmp/
COPY --from=requirements-stage /tmp/chromedriver /usr/bin/chromedriver
COPY --from=requirements-stage /tmp/chrome_version /tmp/chrome_version
COPY --from=requirements-stage /tmp/google-chrome-stable.deb /tmp/
COPY poetry.lock pyproject.toml ./
RUN pip install poetry && \
poetry export --output requirements.txt --without-hashes && \
pip install -r requirements.txt

# Install google chrome and make chromedriver executable
RUN apt update && apt -y install libpq-dev gcc /tmp/google-chrome-stable.deb
RUN chmod +x /usr/bin/chromedriver

COPY ./ /code/

RUN pip install -r /tmp/requirements.txt
COPY ./ ./
13 changes: 12 additions & 1 deletion ecoindex_scraper/scrap.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from datetime import datetime
from json import loads
from os import remove
from shutil import copyfile
from time import sleep
from typing import Dict, Tuple
from uuid import uuid4

import undetected_chromedriver as uc
from ecoindex.ecoindex import get_ecoindex
Expand Down Expand Up @@ -35,7 +38,6 @@ def __init__(
self.screenshot_uid = screenshot_uid
self.screenshot_gid = screenshot_gid
self.chrome_version_main = chrome_version_main
self.driver_executable_path = driver_executable_path
self.page_load_timeout = page_load_timeout

self.chrome_options = uc.ChromeOptions()
Expand All @@ -49,10 +51,19 @@ def __init__(

self.capbs["goog:loggingPrefs"] = {"performance": "ALL"} # type: ignore

if driver_executable_path:
self.driver_executable_path = f"/tmp/chromedriver_{uuid4()}"
copyfile(driver_executable_path, self.driver_executable_path)
else:
self.driver_executable_path = None

def __del__(self):
    """Quit the browser session and delete this instance's chromedriver copy.

    ``__init__`` copies the chromedriver binary to a unique
    ``/tmp/chromedriver_<uuid>`` path for each instance; this finalizer
    removes that copy again. The cleanup is written defensively because
    ``__del__`` also runs on partially initialised objects (when
    ``__init__`` raised) and any exception escaping a finalizer is only
    reported as "Exception ignored", never handled by a caller.
    """
    # getattr with a default: either attribute may be missing if __init__
    # failed before assigning it, or if init_chromedriver() was never called.
    driver = getattr(self, "driver", None)
    driver_copy = getattr(self, "driver_executable_path", None)

    try:
        if driver is not None:
            driver.quit()
    finally:
        # Runs even when quit() raises, so the temporary binary is not leaked.
        if driver_copy:
            try:
                remove(driver_copy)
            except FileNotFoundError:
                # The copy was never written (copyfile failed) or is
                # already gone; there is nothing left to clean up.
                pass

def init_chromedriver(self):
self.driver = uc.Chrome(
options=self.chrome_options,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ include = [

[tool.poetry.dependencies]
python = "^3.10"
ecoindex = {version = "^5.4.2", allow-prereleases = true}
ecoindex = "^5.4.2"
undetected-chromedriver = "3.4.7"
Pillow = "^9.2.0"
selenium = "4.9"
Expand Down
42 changes: 42 additions & 0 deletions tests/docker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed

from ecoindex_scraper.scrap import EcoindexScraper


def run_page_analysis(url):
    """Analyse *url* with a fresh ``EcoindexScraper`` and return the result.

    The scraper is built against the chromedriver at
    ``/usr/bin/chromedriver`` with Chrome major version 114. Any exception
    raised while building the scraper, starting the driver or running the
    analysis is printed, and ``None`` is returned instead of a result.
    """
    try:
        scraper = EcoindexScraper(
            url=url,
            driver_executable_path="/usr/bin/chromedriver",
            chrome_version_main=114,
        )
        # get_page_analysis() is awaited through asyncio.run, so each call
        # drives its own event loop.
        analysis = scraper.init_chromedriver().get_page_analysis()
        return asyncio.run(analysis)
    except Exception as error:
        print(error)
        return None


# Smoke test: submit 10 analyses of the same page to a pool of 8 worker
# threads and print each result (or the error it raised) as it completes.
with ThreadPoolExecutor(max_workers=8) as executor:
    url = "https://www.ecoindex.fr"

    # Map every submitted future back to the URL it analyses.
    future_to_analysis = {
        executor.submit(run_page_analysis, url): url for _ in range(10)
    }

    for done in as_completed(future_to_analysis):
        try:
            outcome = done.result()
        except Exception as error:
            print(error)
        else:
            print(outcome)

0 comments on commit de05595

Please sign in to comment.