-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_splitter.py
127 lines (90 loc) · 4.44 KB
/
pdf_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import PyPDF2
import argparse
from tqdm import tqdm
def create_output_dir(output_dir):
    """Ensure the output directory exists, creating parent directories as needed.

    Args:
        output_dir: Path of the directory that will receive the split PDFs.

    Raises:
        OSError: If the directory cannot be created (for example, when the
            path already exists but is not a directory).
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: the directory may appear between the two calls.
    os.makedirs(output_dir, exist_ok=True)
def parse_page_ranges(range_str):
    """Convert a comma-separated page specification into index pairs.

    Each item is either a single one-based page number ("3") or an inclusive
    one-based range ("2-5"). Every item becomes a ``(start, end)`` tuple where
    ``start`` is zero-based and ``end`` is exclusive, i.e. suitable for
    ``range(start, end)``. Items that cannot be parsed are reported on stdout
    and left out of the result.

    Args:
        range_str: The raw specification, e.g. "1-5,6-10" or "1,3,5".

    Returns:
        A list of ``(start, end)`` integer tuples in input order.
    """
    parsed = []
    for token in range_str.split(','):
        try:
            if '-' not in token:
                number = int(token)
                span = (number - 1, number)
            else:
                # Unpacking requires exactly two pieces; anything else
                # raises ValueError just like a non-numeric piece does.
                first, last = (int(piece) for piece in token.split('-'))
                span = (first - 1, last)
        except ValueError:
            print(f"Invalid range format: {token}. Skipping.")
            continue
        parsed.append(span)
    return parsed
def validate_page_range(start, end, total_pages):
    """Report whether ``[start, end)`` is a non-empty slice of the document.

    Args:
        start: Zero-based index of the first page.
        end: Exclusive end index.
        total_pages: Number of pages in the document.

    Returns:
        True when ``0 <= start < end <= total_pages``, otherwise False.
    """
    return 0 <= start < end <= total_pages
def write_log(log, message):
    """Echo a message to stdout, appending it to the log first when one is open.

    Args:
        log: A writable text file object, or a falsy value when no log
            file is in use.
        message: The text to record; a newline is added in the log.
    """
    if not log:
        print(message)
        return
    log.write(message + '\n')
    print(message)
def split_pdf(input_pdf_path, output_dir, page_ranges, overwrite=False, output_filename=None, log_file=None):
    """
    Split a PDF into smaller PDFs, one output file per page range.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_dir: Directory in which the output files are written.
        page_ranges: Sized sequence of ``(start, end)`` tuples where ``start``
            is a zero-based page index and ``end`` is exclusive.
        overwrite: When False, an output file that already exists is left
            untouched and the range is skipped.
        output_filename: Optional ``str.format`` template for output names;
            see ``generate_output_filename``.
        log_file: Optional path of a text file to which progress and error
            messages are appended.

    Any exception raised while processing is reported on stdout (and in the
    log file when one is open) instead of propagating to the caller.
    """
    # Bind ``log`` before the try block so the except/finally clauses can
    # always reference it, even when opening a file fails early.
    log = None
    try:
        # Open the log first so that a failure to open the input PDF is
        # recorded in it as well.
        if log_file:
            log = open(log_file, 'a')

        with open(input_pdf_path, 'rb') as input_pdf:
            pdf_reader = PyPDF2.PdfReader(input_pdf)
            total_pages = len(pdf_reader.pages)

            write_log(log, f"Splitting {input_pdf_path} into smaller files:")

            progress = tqdm(enumerate(page_ranges), desc="Processing Pages", total=len(page_ranges))
            for idx, (start, end) in progress:
                if not validate_page_range(start, end, total_pages):
                    # ``start`` is zero-based internally; report the one-based
                    # page number so it matches what the user typed.
                    write_log(log, f"Invalid page range {start + 1}-{end}. Skipping.")
                    continue

                output_pdf_path = generate_output_filename(output_dir, idx, start, end, output_filename)

                # Decide whether to skip before copying any pages, so no
                # work is done for a file that will not be written.
                if os.path.exists(output_pdf_path) and not overwrite:
                    write_log(log, f"File {output_pdf_path} already exists. Skipping...")
                    continue

                pdf_writer = PyPDF2.PdfWriter()
                for page_num in range(start, end):
                    pdf_writer.add_page(pdf_reader.pages[page_num])

                with open(output_pdf_path, 'wb') as output_pdf:
                    pdf_writer.write(output_pdf)

                write_log(log, f"Created {output_pdf_path}")
    except Exception as e:
        # Best-effort CLI behavior: report the failure rather than crash.
        error_message = f"Error processing {input_pdf_path}: {e}"
        print(error_message)
        if log:
            log.write(error_message + '\n')
    finally:
        # Single place where the log is closed, on success and on failure.
        if log:
            log.close()
def generate_output_filename(output_dir, idx, start, end, output_filename=None):
    """Build the path of the output PDF for one page range.

    Args:
        output_dir: Directory the file will live in.
        idx: Zero-based position of the range in the request.
        start: Zero-based index of the first page in the range.
        end: Exclusive end index (equal to the one-based last page).
        output_filename: Optional ``str.format`` template receiving the
            one-based part number, first page and last page as positional
            fields ``{0}``, ``{1}`` and ``{2}``. When empty or None the
            default ``split_<part>_<first>_<last>.pdf`` name is used.

    Returns:
        The output path joined onto ``output_dir``.
    """
    part_no, first_page = idx + 1, start + 1
    if not output_filename:
        name = f"split_{part_no}_{first_page}_{end}.pdf"
    else:
        name = output_filename.format(part_no, first_page, end)
    return os.path.join(output_dir, name)
def parse_args():
    """Build the command-line interface and parse ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with attributes ``input_pdf``,
        ``output_dir``, ``ranges``, ``output_filename``, ``overwrite`` and
        ``log_file``.
    """
    cli = argparse.ArgumentParser(description="Split a large PDF file into smaller parts by page range.")

    # (flag, add_argument keyword options), registered in this order.
    option_specs = (
        ('input_pdf', {'help': "Path to the input PDF file to split."}),
        ('output_dir', {'help': "Directory to save the split PDF files."}),
        ('--ranges', {
            'type': str,
            'required': True,
            'help': "Comma-separated page ranges, e.g. '1-5,6-10' or single pages '1,3,5'.",
        }),
        ('--output_filename', {
            'type': str,
            'default': None,
            'help': "Custom output filename format, e.g. 'output_{0}_{1}_{2}.pdf'.",
        }),
        ('--overwrite', {'action': 'store_true', 'help': "Overwrite existing files."}),
        ('--log_file', {
            'type': str,
            'default': None,
            'help': "Log file to record created PDFs.",
        }),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()
def main():
    """Run the command-line tool: parse arguments, prepare the output
    directory, then split the input PDF by the requested ranges."""
    options = parse_args()
    create_output_dir(options.output_dir)
    split_pdf(
        options.input_pdf,
        options.output_dir,
        parse_page_ranges(options.ranges),
        overwrite=options.overwrite,
        output_filename=options.output_filename,
        log_file=options.log_file,
    )
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()