-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_convertpdf_tojpg.py
54 lines (43 loc) · 1.71 KB
/
_convertpdf_tojpg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests, re, json, time, random, csv, os, datetime, lxml, glob, cv2, imutils, subprocess, pytesseract
from bs4 import BeautifulSoup as bs
from PIL import Image
from pytesseract import image_to_string
from pdf2image import convert_from_path
import multiprocessing as mp
from tqdm import tqdm
def write_json_tofile(response, outfilename):
    """Append ``response`` to ``outfilename`` as a single JSON line.

    The file is opened in append mode, so repeated calls build up a
    JSON-lines file (one document per line).

    Args:
        response: Any object that ``json.dumps`` can serialize.
        outfilename: Path of the file to append to; created if missing.

    Raises:
        TypeError: If ``response`` is not JSON-serializable. The file is
            not opened (or created) in that case.
        OSError: If the file cannot be opened for appending.
    """
    # Serialize first so a bad record never creates or touches the file.
    line = json.dumps(response) + '\n'
    # json.dumps emits ASCII by default; the explicit encoding just keeps the
    # result independent of the platform's default locale.
    with open(outfilename, "a", encoding="utf-8") as f:
        f.write(line)
def read_jsoncsv(fname):
    """Read a JSON-lines file and return its records as a list.

    Each non-blank line of ``fname`` is parsed with ``json.loads``.

    Args:
        fname: Path of the file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order
        (an empty list for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(fname, 'r', encoding="utf-8") as f:
        # Iterate the file directly instead of readlines(), and skip blank
        # lines (e.g. a stray trailing newline) that json.loads would reject.
        jobs = [json.loads(line) for line in f if line.strip()]
    return jobs
def convert_tojpeg(pdffiles, imageoutfilepath):
    """Render every page of each PDF in ``pdffiles`` to a JPEG file.

    For a PDF whose file name is ``<stem>.pdf`` the pages are written to::

        <imageoutfilepath>/pdfimage_<stem>/page_<n>_of_<total>__pdfimage_<stem>.jpg

    where ``n`` is the zero-based page index and ``total`` the page count.

    Args:
        pdffiles: Iterable of paths to PDF files.
        imageoutfilepath: Directory under which one sub-folder per PDF is
            created. A trailing separator is optional.
    """
    for pdffile in pdffiles:
        # basename() understands the platform's separators, unlike a literal
        # split on '/'. The stem is still cut at the first '.pdf' so output
        # names stay the same as before.
        pdfoutfilename = "pdfimage_" + os.path.basename(pdffile).split('.pdf', 1)[0]
        imageoutfolder = os.path.join(imageoutfilepath, pdfoutfilename)
        # makedirs + exist_ok: re-running the conversion, or a missing parent
        # directory, must not abort the whole chunk the way os.mkdir did.
        os.makedirs(imageoutfolder, exist_ok=True)
        # Second positional argument of pdf2image's convert_from_path is the
        # rendering resolution (dpi).
        pages = convert_from_path(pdffile, 600)
        total = len(pages)
        print(total)
        for n, page in enumerate(pages):
            pimageoutfile = os.path.join(
                imageoutfolder,
                'page_' + str(n) + '_of_' + str(total) + '__' + pdfoutfilename + '.jpg',
            )
            page.save(pimageoutfile, 'JPEG')
# Progress bar advanced by the pool callbacks. It is created inside main() so
# that merely importing this module (which multiprocessing workers do under
# the "spawn" start method) has no side effects.
pbar = None


def update(*a):
    """Advance the progress bar by one finished chunk.

    Used as the ``callback`` of ``Pool.apply_async``, which passes the task's
    return value; any arguments are accepted and ignored. Does nothing if the
    progress bar has not been created yet.
    """
    if pbar is not None:
        pbar.update()


def _report_failure(exc):
    """Report a chunk whose worker raised, then advance the progress bar."""
    # Without an error_callback, apply_async drops worker exceptions silently.
    tqdm.write("chunk failed: {!r}".format(exc))
    update()


def main():
    """Convert all PDFs in ``<cwd>/data/pdfs/`` to JPEGs in ``<cwd>/data/images/``.

    The PDF list is split into chunks of ``n`` files; each chunk is handed to
    ``convert_tojpeg`` in a pool of 8 worker processes, and the progress bar
    counts completed chunks.
    """
    global pbar

    mainpath = str(os.getcwd()) + '/'
    imageoutfilepath = mainpath + 'data/images/'
    os.makedirs(imageoutfilepath, exist_ok=True)
    pdffiles = glob.glob(mainpath + 'data/pdfs/*.pdf')
    print(len(pdffiles))

    # Serial alternative (no worker pool): convert_tojpeg(pdffiles, imageoutfilepath)
    n = 100  # number of PDFs per task submitted to the pool
    outfilechunks = [pdffiles[i * n:(i + 1) * n] for i in range((len(pdffiles) + n - 1) // n)]
    print(len(outfilechunks))

    pbar = tqdm(total=len(outfilechunks))
    try:
        # The context manager terminates the pool if submission fails; on the
        # normal path close() + join() wait for every chunk to finish first.
        with mp.Pool(processes=8) as pool:
            for chunk in outfilechunks:
                pool.apply_async(
                    convert_tojpeg,
                    args=(chunk, imageoutfilepath),
                    callback=update,
                    error_callback=_report_failure,
                )
            pool.close()
            pool.join()
    finally:
        pbar.close()


# Required for multiprocessing: child processes started with "spawn" re-import
# this module and must not start pools of their own.
if __name__ == "__main__":
    main()