Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jozefbaranec committed Feb 11, 2025
1 parent fb54b7c commit 2185c12
Show file tree
Hide file tree
Showing 12 changed files with 624 additions and 2 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/code-style.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Lint workflow: verifies formatting and lint rules with Ruff on every push
# and pull request.
name: Code style

on:
  - push
  - pull_request

jobs:
  code-style:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4

      # Fails the job when any file would be reformatted by `ruff format`.
      - name: Check formatting
        uses: chartboost/ruff-action@v1
        with:
          args: 'format --check'

      # Runs `ruff check` restricted to the rule families selected below.
      - name: Check coding style
        uses: chartboost/ruff-action@v1
        with:
          args: 'check --select F,E,W,I,PLW'
90 changes: 90 additions & 0 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Builds the multi-arch Docker image, pushes it to Docker Hub, refreshes the
# Docker Hub description and uploads config.json to the update-service FTP.
name: Build and Publish Docker Image

# Runs for version tags (vX.Y.Z) and when started manually.
on:
  push:
    # branches:
    #   - main
    #   - dev
    tags:
      # Matches version tags like v1.0.0
      - 'v*.*.*'
  workflow_dispatch:

env:
  DOCKER_HUB_NAMESPACE: pdfix
  DOCKER_HUB_REPOSITORY: autotag-textract

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    runs-on: ubuntu-latest
    permissions:
      packages: write
      contents: read
      attestations: write
      id-token: write
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      # Manual runs (and main-branch refs) publish "latest"; tag pushes
      # publish the tag name itself. The value is exported as env var `tag`
      # for all following steps.
      - name: Set Docker tag
        id: vars
        run: |
          if [[ "$GITHUB_EVENT_NAME" == 'workflow_dispatch' || "$GITHUB_REF" == refs/heads/main ]]; then
            echo "tag=latest" >> "$GITHUB_ENV"
          else
            echo "tag=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV"
          fi

      # `tag` is read from the environment (set via GITHUB_ENV above) rather
      # than interpolated into the script, so the ref name is never parsed
      # as shell code.
      - name: Update config.json version
        run: chmod +x update_version.sh && ./update_version.sh "$tag"

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_HUB_NAMESPACE }}/${{ env.DOCKER_HUB_REPOSITORY }}
          tags: ${{ env.tag }}

      - name: Run tests
        run: chmod +x test.sh && ./test.sh

      - name: Build and push Docker image
        id: push
        uses: docker/build-push-action@v6
        with:
          platforms: linux/amd64,linux/arm64
          context: .
          file: ./Dockerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

      # - name: Generate artifact attestation
      #   uses: actions/attest-build-provenance@v1
      #   with:
      #     subject-name: index.docker.io/${{ env.DOCKER_HUB_NAMESPACE }}/${{ env.DOCKER_HUB_REPOSITORY }}:${{ env.tag }}
      #     subject-digest: ${{ steps.push.outputs.digest }}
      #     push-to-registry: true

      - name: Docker Hub Description
        uses: peter-evans/dockerhub-description@v4
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
          repository: ${{ env.DOCKER_HUB_NAMESPACE }}/${{ env.DOCKER_HUB_REPOSITORY }}

      # Credentials are passed through the environment so that special
      # characters in the secrets cannot break or alter the shell command.
      - name: Upload to FTP
        env:
          FTP_USERNAME: ${{ secrets.FTP_USERNAME }}
          FTP_PASSWORD: ${{ secrets.FTP_PASSWORD }}
        run: |
          curl -T config.json \
            "ftp.pdfix.net/update-service/v1/actions/${DOCKER_HUB_NAMESPACE}/${DOCKER_HUB_REPOSITORY}/config.json" \
            --user "${FTP_USERNAME}:${FTP_PASSWORD}" \
            --ftp-create-dirs
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
29 changes: 29 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Use the official Debian slim image as a base
FROM debian:stable-slim

# Install Python plus pip/venv tooling, then drop the apt caches so they do
# not end up in the image layer
RUN apt-get update && \
    apt-get install -y \
    python3 \
    python3-pip \
    python3-venv \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /usr/autotag/

# Absolute path: a relative "venv" would only resolve while the working
# directory is /usr/autotag, and the container may be started with a
# different one (e.g. `docker run -w /data`)
ENV VIRTUAL_ENV=/usr/autotag/venv

# Create the virtual environment and put it first on PATH so that `pip` and
# `python3` below resolve to the venv
RUN python3 -m venv "$VIRTUAL_ENV"
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Copy and install dependencies into the container
COPY requirements.txt /usr/autotag/
RUN pip install --no-cache-dir -r requirements.txt

# Copy sources and resources
COPY config.json /usr/autotag/
COPY src/ /usr/autotag/src/

# Run main.py with the venv interpreter created above (the venv lives under
# /usr/autotag)
ENTRYPOINT ["/usr/autotag/venv/bin/python3", "/usr/autotag/src/main.py"]
56 changes: 54 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,54 @@
# action-autotag-textract-docker
Autotag PDF documents using AWS Textract Layout Model in Docker
# Autotag Textract

A Docker image that automatically tags a PDF file using AWS Textract.

## Table of Contents

- [Autotag Textract](#autotag-textract)
- [Table of Contents](#table-of-contents)
- [Getting Started](#getting-started)
- [Run using Command Line Interface](#run-using-command-line-interface)
- [Run OCR using REST API](#run-ocr-using-rest-api)
- [Exporting Configuration for Integration](#exporting-configuration-for-integration)
- [License](#license)
- [Help \& Support](#help--support)

## Getting Started

To use this Docker application, you'll need to have Docker installed on your system. If Docker is not installed, please follow the instructions on the [official Docker website](https://docs.docker.com/get-docker/) to install it.


## Run using Command Line Interface

To run the Docker container as a CLI, share the folder containing the PDF to process using the `-v` parameter. In this example it is the current folder.

```bash
docker run -v $(pwd):/data -w /data --rm pdfix/autotag-textract:latest autotag -i input.pdf -o output.pdf
```

The first run will pull the docker image, which may take some time. Make your own image for more advanced use.

For more detailed information about the available command-line arguments, you can run the following command:

```bash
docker run --rm pdfix/autotag-textract:latest --help
```

## Run OCR using REST API
Coming soon. Please contact us.

### Exporting Configuration for Integration
To export the configuration JSON file, use the following command:
```bash
docker run -v $(pwd):/data -w /data --rm pdfix/autotag-textract:latest config -o config.json
```

## License
- PDFix license https://pdfix.net/terms
- AWS Textract

The trial version of the PDFix SDK may apply a watermark on the page and redact random parts of the PDF including the scanned image in the background. Contact us to get an evaluation or production license.

## Help & Support
To obtain a PDFix SDK license or report an issue please contact us at support@pdfix.net.
For more information visit https://pdfix.net
31 changes: 31 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
  "version": "v0.0.0",
  "actions": [
    {
      "name": "Autotag Textract",
      "desc": "Automatically tag PDF using AWS Textract",
      "version": "v0.0.0",
      "icon": "autotag_textract",
      "category": "Metadata",
      "program": "docker run -v ${working_directory}:/data -v \"~/.aws:/root/.aws\" -w /data --rm pdfix/autotag-textract:latest --name \"${license_name}\" --key \"${license_key}\" autotag -i \"/data/${input_pdf}\" -o \"/data/${output_pdf}\"",
      "args": [
        {
          "name": "input_pdf",
          "desc": "Input PDF file",
          "flags": 2,
          "type": "file_path",
          "ext": "pdf",
          "value": ""
        },
        {
          "name": "output_pdf",
          "desc": "Output PDF file",
          "flags": 4,
          "type": "file_path",
          "ext": "pdf",
          "value": ""
        }
      ]
    }
  ]
}
Binary file added examples/1_tables.pdf
Binary file not shown.
24 changes: 24 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Python dependencies, each pinned to an exact version.
# Installed into the image's virtual environment with
# `pip install --no-cache-dir -r requirements.txt`.
amazon-textract-caller==0.2.4
amazon-textract-response-parser==1.0.3
amazon-textract-textractor==1.8.5
boto3==1.36.17
botocore==1.36.17
certifi==2025.1.31
charset-normalizer==3.4.1
editdistance==0.8.1
idna==3.10
jmespath==1.0.1
marshmallow==3.26.1
numpy==2.2.2
# NOTE(review): the non-headless OpenCV wheel usually needs system GL
# libraries that the Dockerfile does not install; if cv2 is imported at
# runtime, opencv-python-headless may be required - TODO confirm.
opencv-python==4.11.0.86
packaging==24.2
pdfix-sdk==8.4.3
pillow==11.1.0
python-dateutil==2.9.0.post0
requests==2.32.3
s3transfer==0.11.2
six==1.17.0
tabulate==0.9.0
tqdm==4.67.1
urllib3==2.3.0
XlsxWriter==3.2.2
Loading

0 comments on commit 2185c12

Please sign in to comment.