-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
fb54b7c
commit 2185c12
Showing
12 changed files
with
624 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Lint workflow: runs the Ruff formatter check and the Ruff linter
# on every push and on every pull request.
name: Code style

on:
  - push
  - pull_request

jobs:
  code-style:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4

      # Invokes `ruff format --check` through the Ruff action.
      - name: Check formatting
        uses: chartboost/ruff-action@v1
        with:
          args: format --check

      # Invokes `ruff check` restricted to the rule selectors F, E, W, I and PLW.
      - name: Check coding style
        uses: chartboost/ruff-action@v1
        with:
          args: check --select F,E,W,I,PLW
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
# Release workflow: builds the multi-architecture Docker image, pushes it to
# Docker Hub, refreshes the Docker Hub description and uploads config.json
# to the FTP update service.
name: Build and Publish Docker Image

on:
  push:
    # branches:
    #   - main
    #   - dev
    tags:
      - 'v*.*.*' # Matches version tags like v1.0.0
  workflow_dispatch:

env:
  DOCKER_HUB_NAMESPACE: pdfix
  DOCKER_HUB_REPOSITORY: autotag-textract

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    runs-on: ubuntu-latest
    permissions:
      packages: write
      contents: read
      attestations: write
      id-token: write
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      # Manual runs (and refs/heads/main) publish `latest`; tag pushes publish
      # the tag name itself. The value is exported as the `tag` environment
      # variable for all later steps.
      - name: Set Docker tag
        id: vars
        run: |
          if [[ "$GITHUB_EVENT_NAME" == 'workflow_dispatch' || "$GITHUB_REF" == 'refs/heads/main' ]]; then
            echo "tag=latest" >> "$GITHUB_ENV"
          else
            echo "tag=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV"
          fi

      # `tag` is read as a quoted shell variable instead of being spliced in
      # with `${{ env.tag }}`: the value comes from the pushed ref name, so
      # expression interpolation would allow shell injection.
      # NOTE(review): update_version.sh is not visible here; presumably it
      # writes the tag into config.json - confirm.
      - name: Update config.json version
        run: chmod +x update_version.sh && ./update_version.sh "$tag"

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_HUB_NAMESPACE }}/${{ env.DOCKER_HUB_REPOSITORY }}
          tags: ${{ env.tag }}

      - name: Run tests
        run: chmod +x test.sh && ./test.sh

      - name: Build and push Docker image
        id: push
        uses: docker/build-push-action@v6
        with:
          platforms: linux/amd64,linux/arm64
          context: .
          file: ./Dockerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

      # - name: Generate artifact attestation
      #   uses: actions/attest-build-provenance@v1
      #   with:
      #     subject-name: index.docker.io/${{ env.DOCKER_HUB_NAMESPACE }}/${{ env.DOCKER_HUB_REPOSITORY }}:${{ env.tag }}
      #     subject-digest: ${{ steps.push.outputs.digest }}
      #     push-to-registry: true

      - name: Docker Hub Description
        uses: peter-evans/dockerhub-description@v4
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
          repository: ${{ env.DOCKER_HUB_NAMESPACE }}/${{ env.DOCKER_HUB_REPOSITORY }}

      # Credentials are handed to the script through the step environment so
      # that special characters in a secret cannot break or alter the command.
      - name: Upload to FTP
        env:
          FTP_USERNAME: ${{ secrets.FTP_USERNAME }}
          FTP_PASSWORD: ${{ secrets.FTP_PASSWORD }}
        run: |
          curl -T config.json \
            "ftp.pdfix.net/update-service/v1/actions/${DOCKER_HUB_NAMESPACE}/${DOCKER_HUB_REPOSITORY}/config.json" \
            --user "${FTP_USERNAME}:${FTP_PASSWORD}" \
            --ftp-create-dirs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
.vscode/ | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Use the official Debian slim image as a base
FROM debian:stable-slim

# Install Python plus the tooling needed to create a virtual environment
RUN apt-get update && \
    apt-get install -y \
    python3 \
    python3-pip \
    python3-venv \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /usr/autotag/

# Absolute path: a relative value would make the PATH entry below depend on
# the working directory, which callers override (e.g. `docker run -w /data`).
ENV VIRTUAL_ENV=/usr/autotag/venv

# Create the virtual environment and put its bin directory first on PATH
RUN python3 -m venv "$VIRTUAL_ENV"
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Copy and install dependencies into the container
COPY requirements.txt /usr/autotag/
RUN pip install --no-cache-dir -r requirements.txt

# Copy sources and resources
COPY config.json /usr/autotag/
COPY src/ /usr/autotag/src/

# Run main.py with the interpreter of the virtual environment created above
ENTRYPOINT ["/usr/autotag/venv/bin/python3", "/usr/autotag/src/main.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,54 @@ | ||
# action-autotag-textract-docker | ||
Autotag PDF documents using AWS Textract Layout Model in Docker | ||
# Autotag Textract | ||
|
||
A Docker image that automatically tags a PDF file using AWS Textract. | ||
|
||
## Table of Contents | ||
|
||
- [Autotag Textract](#autotag-textract) | ||
- [Table of Contents](#table-of-contents) | ||
- [Getting Started](#getting-started) | ||
- [Run using Command Line Interface](#run-using-command-line-interface) | ||
- [Run OCR using REST API](#run-ocr-using-rest-api) | ||
- [Exporting Configuration for Integration](#exporting-configuration-for-integration) | ||
- [License](#license) | ||
- [Help \& Support](#help--support) | ||
|
||
## Getting Started | ||
|
||
To use this Docker application, you'll need to have Docker installed on your system. If Docker is not installed, please follow the instructions on the [official Docker website](https://docs.docker.com/get-docker/) to install it. | ||
|
||
|
||
## Run using Command Line Interface | ||
|
||
To run the Docker container as a CLI, share the folder containing the PDF to process using the `-v` parameter. In this example it is the current folder.
|
||
```bash | ||
docker run -v $(pwd):/data -w /data --rm pdfix/autotag-textract:latest autotag -i input.pdf -o output.pdf | ||
``` | ||
|
||
The first run will pull the docker image, which may take some time. Make your own image for more advanced use. | ||
|
||
For more detailed information about the available command-line arguments, you can run the following command: | ||
|
||
```bash | ||
docker run --rm pdfix/autotag-textract:latest --help | ||
``` | ||
|
||
## Run OCR using REST API | ||
Coming soon. Please contact us.
|
||
### Exporting Configuration for Integration | ||
To export the configuration JSON file, use the following command: | ||
```bash | ||
docker run -v $(pwd):/data -w /data --rm pdfix/autotag-textract:latest config -o config.json | ||
``` | ||
|
||
## License | ||
- PDFix license https://pdfix.net/terms | ||
- AWS Textract | ||
|
||
The trial version of the PDFix SDK may apply a watermark on the page and redact random parts of the PDF including the scanned image in the background. Contact us to get an evaluation or production license. | ||
|
||
## Help & Support | ||
To obtain a PDFix SDK license or report an issue please contact us at support@pdfix.net. | ||
For more information visit https://pdfix.net |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
{
  "version": "v0.0.0",
  "actions": [
    {
      "name": "Autotag Textract",
      "desc": "Automatically tag PDF using AWS Textract",
      "version": "v0.0.0",
      "icon": "autotag_textract",
      "category": "Metadata",
      "program": "docker run -v \"${working_directory}:/data\" -v \"~/.aws:~/.aws\" -w /data --rm pdfix/autotag-textract:latest --name \"${license_name}\" --key \"${license_key}\" autotag -i \"/data/${input_pdf}\" -o \"/data/${output_pdf}\"",
      "args": [
        {
          "name": "input_pdf",
          "desc": "Input PDF file",
          "flags": 2,
          "type": "file_path",
          "ext": "pdf",
          "value": ""
        },
        {
          "name": "output_pdf",
          "desc": "Output PDF file",
          "flags": 4,
          "type": "file_path",
          "ext": "pdf",
          "value": ""
        }
      ]
    }
  ]
}
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Python dependencies, each pinned to an exact version with `==`.
amazon-textract-caller==0.2.4 | ||
amazon-textract-response-parser==1.0.3 | ||
amazon-textract-textractor==1.8.5 | ||
boto3==1.36.17 | ||
botocore==1.36.17 | ||
certifi==2025.1.31 | ||
charset-normalizer==3.4.1 | ||
editdistance==0.8.1 | ||
idna==3.10 | ||
jmespath==1.0.1 | ||
marshmallow==3.26.1 | ||
numpy==2.2.2 | ||
opencv-python==4.11.0.86 | ||
packaging==24.2 | ||
pdfix-sdk==8.4.3 | ||
pillow==11.1.0 | ||
python-dateutil==2.9.0.post0 | ||
requests==2.32.3 | ||
s3transfer==0.11.2 | ||
six==1.17.0 | ||
tabulate==0.9.0 | ||
tqdm==4.67.1 | ||
urllib3==2.3.0 | ||
XlsxWriter==3.2.2 |
Oops, something went wrong.