Commit
Adding project
AnthonyRAFFY committed Aug 14, 2024
0 parents commit 2dbddc0
Showing 11 changed files with 832 additions and 0 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/docker-image.yml
@@ -0,0 +1,53 @@
name: Create and publish a Docker image to GitHub Packages

on:
  push:
    tags:
      - v*

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
      attestations: write
      id-token: write
    steps:
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      - name: Build and push Docker image
        id: push
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
          push: true
          cache-from: type=gha
          cache-to: type=gha,mode=max
          platforms: linux/amd64
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
__pycache__
*.pyc
.cache
.venv
45 changes: 45 additions & 0 deletions Dockerfile
@@ -0,0 +1,45 @@
FROM python:3.12.4-slim-bookworm

ARG TARGETPLATFORM

ENV POETRY_VERSION=1.8.3
ENV POETRY_HOME=/opt/poetry
ENV POETRY_VENV=/opt/poetry-venv
ENV POETRY_CACHE_DIR=/opt/.cache

RUN apt-get update && apt-get install -qq -y --fix-missing --no-install-recommends \
    build-essential \
    gcc \
    openssl \
    libffi-dev \
    libssl-dev \
    pkg-config \
    curl

# Rust is only needed on armv7, where some wheels must be built from source;
# the sed works around rustup's /proc/self/exe lookup failing under QEMU emulation.
RUN if [ "$TARGETPLATFORM" = "linux/arm/v7" ] ; then \
        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sed 's#/proc/self/exe#/bin/sh#g' | sh -s -- -y && \
        . $HOME/.cargo/env; \
    fi

ENV PATH="/root/.cargo/bin:${PATH}"

RUN python3 -m venv $POETRY_VENV \
    && $POETRY_VENV/bin/pip install -U pip setuptools \
    && $POETRY_VENV/bin/pip install poetry==${POETRY_VERSION}

ENV PATH="${PATH}:${POETRY_VENV}/bin"

WORKDIR /app/

COPY ./pyproject.toml ./poetry.lock* /app/

ARG INSTALL_DEV=false
RUN if [ "$INSTALL_DEV" = "true" ] ; then poetry install --no-root ; else poetry install --no-root --only main ; fi

ENV PYTHONPATH="/app/scrappey_proxy"

COPY ./scrappey_proxy /app/scrappey_proxy

EXPOSE 8191

CMD ["poetry", "run", "gunicorn", "main:app", "--bind", "0.0.0.0:8191", "--log-level", "debug", "--timeout", "600"]
382 changes: 382 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions pyproject.toml
@@ -0,0 +1,17 @@
[tool.poetry]
name = "scrappey_proxy"
version = "0.1.0"
description = "FlareSolverr substitute using Scrappey.com"
authors = ["Anthony RAFFY <anthony.raffy38@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "3.12.4"
flask = "^3.0.3"
scrappeycom = "^0.3.8"
gunicorn = "^22.0.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Empty file added scrappey_proxy/__init__.py
Empty file.
107 changes: 107 additions & 0 deletions scrappey_proxy/flaresolverr.py
@@ -0,0 +1,107 @@
import logging
from typing import Callable

from scrappey_proxy import utils

STATUS_OK = "ok"
STATUS_ERROR = "error"

logger = logging.getLogger("gunicorn.error")


class ChallengeResolutionResultT:
    url: str | None = None
    status: int | None = None
    headers: dict | None = None
    response: str | None = None
    cookies: list | None = None
    userAgent: str | None = None

    def __init__(self, _dict):
        self.__dict__.update(_dict)


class ChallengeResolutionT:
    status: str | None = None
    message: str | None = None
    result: ChallengeResolutionResultT | None = None

    def __init__(self, _dict):
        self.__dict__.update(_dict)
        if self.result is not None:
            self.result = ChallengeResolutionResultT(self.result)


class V1RequestBase(object):
    cmd: str | None = None
    cookies: list | None = None
    maxTimeout: int | None = None
    proxy: dict | None = None
    session: str | None = None
    session_ttl_minutes: int | None = None
    headers: list | None = None  # deprecated v2.0.0, not used
    userAgent: str | None = None  # deprecated v2.0.0, not used
    # V1Request
    url: str | None = None
    postData: str | None = None
    returnOnlyCookies: bool | None = None
    download: bool | None = None  # deprecated v2.0.0, not used
    returnRawHtml: bool | None = None  # deprecated v2.0.0, not used

    def __init__(self, _dict):
        self.__dict__.update(_dict)


class V1ResponseBase(object):
    # V1ResponseBase
    status: str | None = None
    message: str | None = None
    session: str | None = None
    sessions: list[str] | None = None
    startTimestamp: int | None = None
    endTimestamp: int | None = None
    version: str | None = None
    # V1ResponseSolution
    solution: ChallengeResolutionResultT | None = None
    # hidden vars
    __error_500__: bool = False

    def __init__(self, _dict):
        self.__dict__.update(_dict)
        if self.solution is not None:
            self.solution = ChallengeResolutionResultT(self.solution)


def controller_v1_logic(
    req: V1RequestBase, get_handler: Callable[[V1RequestBase], V1ResponseBase]
) -> V1ResponseBase:
    if req.cmd is None:
        raise Exception("Request parameter 'cmd' is mandatory.")

    if req.maxTimeout is None or int(req.maxTimeout) < 1:
        req.maxTimeout = 60000

    res: V1ResponseBase
    if req.cmd == "request.get":
        res = get_handler(req)
    else:
        raise Exception(f"Request parameter 'cmd' = '{req.cmd}' is invalid.")

    return res


def controller_v1_handler(
    req: V1RequestBase, get_handler: Callable[[V1RequestBase], V1ResponseBase]
):
    res: V1ResponseBase
    logger.info(f"Incoming request => POST /v1 body: {utils.object_to_dict(req)}")
    try:
        res = controller_v1_logic(req, get_handler)
    except Exception as e:
        res = V1ResponseBase({})
        res.__error_500__ = True
        res.status = STATUS_ERROR
        res.message = f"Error: {e}"

    res.version = "3.0.0"
    return res
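
The controller above mirrors FlareSolverr's v1 flow: it rejects a missing cmd, defaults maxTimeout to 60000 ms, dispatches request.get to the supplied handler, and converts any exception into an error response before stamping version 3.0.0. A minimal sketch of wiring in a handler; the stub below is hypothetical and assumes the package (including the utils helper, which is not shown in this diff) is importable:

from scrappey_proxy.flaresolverr import (
    STATUS_OK,
    V1RequestBase,
    V1ResponseBase,
    controller_v1_handler,
)


def stub_get_handler(req: V1RequestBase) -> V1ResponseBase:
    # Hypothetical handler: pretend the page was fetched with no challenge.
    res = V1ResponseBase({})
    res.status = STATUS_OK
    res.message = f"Stubbed fetch of {req.url}"
    return res


req = V1RequestBase({"cmd": "request.get", "url": "https://example.com"})
res = controller_v1_handler(req, stub_get_handler)
print(res.status, res.version)  # ok 3.0.0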
114 changes: 114 additions & 0 deletions scrappey_proxy/main.py
@@ -0,0 +1,114 @@
import logging
import os
from typing import Dict, List

import requests
from flask import Flask, request

from scrappey_proxy import utils
from scrappey_proxy.flaresolverr import (
    STATUS_OK,
    ChallengeResolutionResultT,
    ChallengeResolutionT,
    V1RequestBase,
    V1ResponseBase,
    controller_v1_handler,
)
from scrappey_proxy.scrappey import ScrappeyResponse, get_scrappey

logger = logging.getLogger("gunicorn.error")

PROXY_USERNAME = os.environ.get("PROXY_USERNAME", "")
PROXY_PASSWORD = os.environ.get("PROXY_PASSWORD", "")
PROXY_INTERNAL_IP = os.environ["PROXY_INTERNAL_IP"]
PROXY_EXTERNAL_IP = os.environ["PROXY_EXTERNAL_IP"]
PROXY_INTERNAL_PORT = os.environ["PROXY_INTERNAL_PORT"]
PROXY_EXTERNAL_PORT = os.environ["PROXY_EXTERNAL_PORT"]

saved_cookies = []
saved_headers = {}

proxy = f"{PROXY_USERNAME}:{PROXY_PASSWORD}@{PROXY_INTERNAL_IP}:{PROXY_INTERNAL_PORT}"
proxies = {"http": proxy, "https": proxy}

app = Flask(__name__)


def save_cookies(cookies: List[Dict[str, str]]):
    global saved_cookies
    saved_cookies = cookies


def save_user_agent(user_agent: str):
    global saved_headers
    saved_headers["User-Agent"] = user_agent


def get_sendable_cookies():
    sendable_cookies = {}
    for cookie in saved_cookies:
        sendable_cookies[cookie["name"]] = cookie["value"]
    return sendable_cookies


def cmd_request_get(req: V1RequestBase) -> V1ResponseBase:
    challenge_res_result = ChallengeResolutionResultT({})
    challenge_res_result.url = req.url
    challenge_res_result.status = 200

    challenge_res = ChallengeResolutionT({})
    challenge_res.status = STATUS_OK

    if not req.url:
        raise Exception("Request parameter 'url' is mandatory.")

    logger.info(f"Simple request to {req.url} to check for Cloudflare")
    logger.debug(f"Using {proxies} as local proxy")
    basic_req = requests.get(
        req.url, cookies=get_sendable_cookies(), headers=saved_headers, proxies=proxies
    )

    if utils.detect_cloudflare(basic_req):
        logger.info("Detected Cloudflare")
        scrappey_res: ScrappeyResponse = get_scrappey(req)
        challenge_res_result.cookies = scrappey_res.cookies
        challenge_res.message = "Challenge solved!"
        challenge_res_result.headers = {}
        challenge_res_result.response = scrappey_res.response
        challenge_res_result.userAgent = scrappey_res.user_agent
        save_cookies(scrappey_res.cookies)
        save_user_agent(scrappey_res.user_agent)
    else:
        logger.info("Cloudflare not detected or cf_clearance cookie still valid")
        challenge_res.message = (
            "Cloudflare not detected or cf_clearance cookie still valid!"
        )
        logger.debug(basic_req.text)
        challenge_res_result.headers = {}
        challenge_res_result.cookies = saved_cookies
        challenge_res_result.response = basic_req.text
        challenge_res_result.userAgent = saved_headers.get("User-Agent", "")

    challenge_res.result = challenge_res_result

    res = V1ResponseBase({})
    res.status = challenge_res.status
    res.message = challenge_res.message
    res.solution = challenge_res.result
    return res


@app.post("/v1")
def controller_v1_endpoint():
    req = V1RequestBase(request.json)
    res = controller_v1_handler(req, cmd_request_get)

    return utils.object_to_dict(res)


if __name__ != "__main__":
    # When run under gunicorn (i.e. not invoked directly), reuse its log handlers.
    gunicorn_logger = logging.getLogger("gunicorn.error")
    app.logger.handlers = gunicorn_logger.handlers
    app.logger.setLevel(gunicorn_logger.level)
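
The endpoint accepts FlareSolverr-style v1 payloads. A minimal client sketch, assuming the container is running locally on the port exposed in the Dockerfile (8191) and a hypothetical target URL:

import requests

payload = {
    "cmd": "request.get",  # the only command this service implements
    "url": "https://example.com",  # hypothetical target
    "maxTimeout": 60000,  # optional; the controller defaults to 60000 ms
}
resp = requests.post("http://localhost:8191/v1", json=payload)
body = resp.json()
print(body["status"], body["message"])
# Page HTML, cookies, and user agent sit under body["solution"],
# mirroring the fields of ChallengeResolutionResultT.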
49 changes: 49 additions & 0 deletions scrappey_proxy/scrappey.py
@@ -0,0 +1,49 @@
import logging
import os
from dataclasses import dataclass
from typing import Dict, List

from scrappeycom.scrappey import Scrappey

from scrappey_proxy.flaresolverr import V1RequestBase

logger = logging.getLogger("gunicorn.error")

PROXY_USERNAME = os.environ.get("PROXY_USERNAME", "")
PROXY_PASSWORD = os.environ.get("PROXY_PASSWORD", "")
PROXY_EXTERNAL_IP = os.environ["PROXY_EXTERNAL_IP"]
PROXY_EXTERNAL_PORT = os.environ["PROXY_EXTERNAL_PORT"]

proxy_url = f"http://{PROXY_USERNAME}:{PROXY_PASSWORD}@{PROXY_EXTERNAL_IP}:{PROXY_EXTERNAL_PORT}"


@dataclass
class ScrappeyResponse:
    response: str
    status_code: int
    cookies: List[Dict[str, str]]
    user_agent: str


scrappey = Scrappey(os.environ["SCRAPPEY_API_KEY"])


# Forward a FlareSolverr-style request to Scrappey and normalize the result.
def get_scrappey(request: V1RequestBase) -> ScrappeyResponse:
    logger.info(f"Calling Scrappey for URL: {request.url}")
    logger.debug(f"Using {proxy_url} as Scrappey's proxy")

    get_request_result = scrappey.get({"url": request.url, "proxy": proxy_url})

    if (
        "solution" in get_request_result
        and "response" in get_request_result["solution"]
    ):
        return ScrappeyResponse(
            get_request_result["solution"]["response"],
            200,
            get_request_result["solution"]["cookies"],
            get_request_result["solution"]["userAgent"],
        )
    else:
        return ScrappeyResponse("", 500, [], "")
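
get_scrappey treats any SDK result without a "solution" containing a "response" as a failure and returns an empty ScrappeyResponse with status 500. A sketch of the solution shape the happy path expects; the field names come from the accesses above, while the values are made up:

example_result = {
    "solution": {
        "response": "<html>...</html>",  # page body after the challenge was solved
        "cookies": [{"name": "cf_clearance", "value": "abc123"}],  # made-up cookie
        "userAgent": "Mozilla/5.0 ...",  # browser identity the solver used
    }
}
# The happy path maps this onto ScrappeyResponse(response, 200, cookies, user_agent).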
