initial commit

pdfix · Feb 11, 2025 · 2185c12 · 2185c12
1 parent fb54b7c
commit 2185c12
Show file tree

Hide file tree

Showing 12 changed files with 624 additions and 2 deletions.
diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml
@@ -0,0 +1,16 @@
+# Lint workflow: verifies formatting and static-analysis rules with Ruff
+# on every push and on every pull request.
+name: Code style
+on:
+  - push
+  - pull_request
+jobs:
+  code-style:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+      # `--check` only reports files that would be reformatted; nothing is
+      # rewritten in the checkout.
+      - name: Check formatting
+        uses: chartboost/ruff-action@v1
+        with:
+          args: format --check
+      # Enabled rule families: F (Pyflakes), E/W (pycodestyle),
+      # I (isort), PLW (Pylint warnings).
+      - name: Check coding style
+        uses: chartboost/ruff-action@v1
+        with:
+          args: check --select F,E,W,I,PLW
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
@@ -0,0 +1,90 @@
+name: Build and Publish Docker Image
+
+# Runs for pushed version tags and for manual dispatch. The branch filter
+# is kept below but disabled.
+on:
+  workflow_dispatch:
+  push:
+    # branches:
+    #   - main
+    #   - dev
+    tags:
+      - "v*.*.*"  # version tags such as v1.0.0
+
+# Docker Hub coordinates reused by the job's steps.
+env:
+  DOCKER_HUB_REPOSITORY: autotag-textract
+  DOCKER_HUB_NAMESPACE: pdfix
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    runs-on: ubuntu-latest
+    # Scopes granted to the job's GITHUB_TOKEN.
+    permissions:
+      contents: read
+      packages: write
+      id-token: write
+      attestations: write
+    steps:
+      # The repository content is the Docker build context.
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      # QEMU and Buildx are set up before the multi-platform
+      # (linux/amd64 + linux/arm64) image build.
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      # Authenticate against Docker Hub with repository secrets so the image
+      # can be pushed. v3 matches the major version of the other Docker
+      # setup actions in this job and runs on a supported Node runtime.
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Set Docker tag
+        id: vars
+        run: |
+          # Manual runs (and main-branch pushes, should the branch trigger be
+          # enabled) publish "latest"; tag pushes publish the tag name with
+          # the refs/tags/ prefix stripped. The value is exported as the
+          # `tag` environment variable for all later steps.
+          if [[ "$GITHUB_EVENT_NAME" == 'workflow_dispatch' || "$GITHUB_REF" == 'refs/heads/main' ]]; then
+            echo "tag=latest" >> "$GITHUB_ENV"
+          else
+            echo "tag=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV"
+          fi
+
+      # Read the tag from the environment instead of expanding
+      # `${{ env.tag }}` into the script text: a tag name containing shell
+      # metacharacters would otherwise be executed as part of the command.
+      - name: Update config.json version
+        run: chmod +x update_version.sh && ./update_version.sh "$tag"
+
+      # Produce the full image reference(s) and labels consumed by the
+      # build-and-push step through `steps.meta.outputs`.
+      # v5 keeps the `images`/`tags` inputs used here and runs on a
+      # supported Node runtime.
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.DOCKER_HUB_NAMESPACE }}/${{ env.DOCKER_HUB_REPOSITORY }}
+          tags: ${{ env.tag }}
+
+      # The default bash shell runs with `-e`, so a failing chmod still
+      # aborts the step before the test script is started.
+      - name: Run tests
+        run: |
+          chmod +x test.sh
+          ./test.sh
+
+      # Build the image for both target platforms from the repository root
+      # and push it under the tags/labels computed by the `meta` step.
+      - name: Build and push Docker image
+        id: push
+        uses: docker/build-push-action@v6
+        with:
+          # Comma-separated list; no whitespace around the separator.
+          platforms: linux/amd64,linux/arm64
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+
+      # - name: Generate artifact attestation
+      #   uses: actions/attest-build-provenance@v1
+      #   with:
+      #     subject-name: index.docker.io/${{ env.DOCKER_HUB_NAMESPACE }}/${{ env.DOCKER_HUB_REPOSITORY }}:${{ env.tag }}
+      #     subject-digest: ${{ steps.push.outputs.digest }}
+      #     push-to-registry: true
+
+      # Update the description of the Docker Hub repository named below,
+      # using the same credentials as the registry login.
+      - name: Docker Hub Description
+        uses: peter-evans/dockerhub-description@v4
+        with:
+          repository: ${{ env.DOCKER_HUB_NAMESPACE }}/${{ env.DOCKER_HUB_REPOSITORY }}
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      # Publish the versioned config.json to the update service.
+      # Credentials are handed over as environment variables rather than
+      # expanded into the script text, so passwords containing quotes or `$`
+      # can neither break nor inject into the command line.
+      # NOTE(review): plain FTP sends the credentials unencrypted; consider
+      # FTPS/SFTP if the server supports it.
+      - name: Upload to FTP
+        env:
+          FTP_USERNAME: ${{ secrets.FTP_USERNAME }}
+          FTP_PASSWORD: ${{ secrets.FTP_PASSWORD }}
+        run: |
+          curl -T config.json \
+            "ftp://ftp.pdfix.net/update-service/v1/actions/${DOCKER_HUB_NAMESPACE}/${DOCKER_HUB_REPOSITORY}/config.json" \
+            --user "${FTP_USERNAME}:${FTP_PASSWORD}" \
+            --ftp-create-dirs
+
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+.vscode/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,29 @@
+# Use the official Debian slim image as a base
+FROM debian:stable-slim
+
+# Install Python with pip and venv support, then drop the apt caches to keep
+# the layer small
+RUN apt-get update && \
+    apt-get install -y \
+    python3 \
+    python3-pip \
+    python3-venv \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /usr/autotag/
+
+# Absolute path: the container is normally started with `-w /data`, so a
+# relative venv path on PATH would no longer resolve at run time
+ENV VIRTUAL_ENV=/usr/autotag/venv
+
+# Create the virtual environment and put its bin/ first on PATH
+RUN python3 -m venv "$VIRTUAL_ENV"
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Copy and install dependencies into the container
+COPY requirements.txt /usr/autotag/
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy sources and resources
+COPY config.json /usr/autotag/
+COPY src/ /usr/autotag/src/
+
+# Start main.py with the interpreter of the virtual environment created above
+ENTRYPOINT ["/usr/autotag/venv/bin/python3", "/usr/autotag/src/main.py"]
diff --git a/README.md b/README.md
@@ -1,2 +1,54 @@
-# action-autotag-textract-docker
-Autotag PDF documents using AWS Textract Layout Model in Docker
+# Autotag Textract
+
+A Docker image that automatically tags a PDF file using AWS Textract.
+
+## Table of Contents
+
+- [Autotag Textract](#autotag-textract)
+  - [Table of Contents](#table-of-contents)
+  - [Getting Started](#getting-started)
+  - [Run using Command Line Interface](#run-using-command-line-interface)
+  - [Run OCR using REST API](#run-ocr-using-rest-api)
+    - [Exporting Configuration for Integration](#exporting-configuration-for-integration)
+  - [License](#license)
+  - [Help \& Support](#help--support)
+
+## Getting Started
+
+To use this Docker application, you'll need to have Docker installed on your system. If Docker is not installed, please follow the instructions on the [official Docker website](https://docs.docker.com/get-docker/) to install it.
+
+
+## Run using Command Line Interface
+
+To run the Docker container as a CLI, share the folder containing the PDF to process using the `-v` parameter. In this example it's the current folder.
+
+```bash
+docker run -v $(pwd):/data -w /data --rm pdfix/autotag-textract:latest autotag -i input.pdf -o output.pdf
+```
+
+The first run will pull the docker image, which may take some time. Make your own image for more advanced use.
+
+For more detailed information about the available command-line arguments, you can run the following command:
+
+```bash
+docker run --rm pdfix/autotag-textract:latest --help
+```
+
+## Run OCR using REST API
+Coming soon. Please contact us.
+
+### Exporting Configuration for Integration
+To export the configuration JSON file, use the following command:
+```bash
+docker run -v $(pwd):/data -w /data --rm pdfix/autotag-textract:latest config -o config.json
+```
+
+## License
+- PDFix license https://pdfix.net/terms
+- AWS Textract 
+
+The trial version of the PDFix SDK may apply a watermark on the page and redact random parts of the PDF including the scanned image in the background. Contact us to get an evaluation or production license.
+
+## Help & Support
+To obtain a PDFix SDK license or report an issue please contact us at support@pdfix.net.
+For more information visit https://pdfix.net
diff --git a/config.json b/config.json
@@ -0,0 +1,31 @@
+{
+    "version": "v0.0.0",
+    "actions": [
+        {
+            "name": "Autotag Textract",
+            "desc": "Automatically tag PDF using AWS Textract",
+            "version": "v0.0.0",
+            "icon": "autotag_textract",
+            "category": "Metadata",
+            "program": "docker run -v \"${working_directory}:/data\" -v \"$HOME/.aws:/root/.aws\" -w /data --rm pdfix/autotag-textract:latest --name \"${license_name}\" --key \"${license_key}\" autotag -i \"/data/${input_pdf}\" -o \"/data/${output_pdf}\"",
+            "args": [
+                {
+                    "name": "input_pdf",
+                    "desc": "Input PDF file",
+                    "flags": 2,
+                    "type": "file_path",
+                    "ext": "pdf",
+                    "value": ""
+                },
+                {
+                    "name": "output_pdf",
+                    "desc": "Output PDF file",
+                    "flags": 4,
+                    "type": "file_path",
+                    "ext": "pdf",
+                    "value": ""
+                }
+            ]
+        }
+    ]
+}
diff --git a/examples/1_tables.pdf b/examples/1_tables.pdf
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,24 @@
+amazon-textract-caller==0.2.4
+amazon-textract-response-parser==1.0.3
+amazon-textract-textractor==1.8.5
+boto3==1.36.17
+botocore==1.36.17
+certifi==2025.1.31
+charset-normalizer==3.4.1
+editdistance==0.8.1
+idna==3.10
+jmespath==1.0.1
+marshmallow==3.26.1
+numpy==2.2.2
+opencv-python==4.11.0.86
+packaging==24.2
+pdfix-sdk==8.4.3
+pillow==11.1.0
+python-dateutil==2.9.0.post0
+requests==2.32.3
+s3transfer==0.11.2
+six==1.17.0
+tabulate==0.9.0
+tqdm==4.67.1
+urllib3==2.3.0
+XlsxWriter==3.2.2