-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextraxt_content.py
144 lines (112 loc) · 4.31 KB
/
extraxt_content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# import fitz # PyMuPDF
# import os
# from sqlalchemy.orm import Session
# from app.database import SessionLocal
# from app import crud, schemas
# # Define paths
# pdf_path = "/home/adesoji/Downloads/DO-YOU-WONDER-ABOUT-RAIN-SNOW-SLEET-AND-HAIL-Free-Childrens-Book-By-Monkey-Pen.pdf"
# output_dir = "extracted_content"
# os.makedirs(output_dir, exist_ok=True)
# def extract_text_and_images_from_pdf(pdf_path, output_dir):
# doc = fitz.open(pdf_path)
# book_pages = []
# for page_num in range(len(doc)):
# page = doc[page_num]
# text = page.get_text()
# # Extract images
# image_list = page.get_images(full=True)
# image_files = []
# for img_index, img in enumerate(image_list):
# xref = img[0]
# base_image = doc.extract_image(xref)
# image_bytes = base_image["image"]
# image_ext = base_image["ext"]
# image_filename = f"page_{page_num + 1}_img_{img_index + 1}.{image_ext}"
# image_filepath = os.path.join(output_dir, image_filename)
# with open(image_filepath, "wb") as img_file:
# img_file.write(image_bytes)
# image_files.append(image_filepath)
# book_pages.append({
# "page_number": page_num + 1,
# "content": text,
# "image_urls": image_files
# })
# return book_pages
# # Extract content
# book_pages = extract_text_and_images_from_pdf(pdf_path, output_dir)
# # Define book metadata
# book_data = schemas.BookCreate(
# title="DO YOU WONDER ABOUT RAIN, SNOW, SLEET, AND HAIL",
# author="T. Albert",
# publisher="Monkey Pen Ltd",
# description="A children's book explaining the water cycle and different forms of precipitation.",
# pages=[]
# )
# # Add extracted pages to book data
# for page in book_pages:
# book_page = schemas.BookPage(
# page_number=page["page_number"],
# content=page["content"],
# image_url=page["image_urls"][0] if page["image_urls"] else None
# )
# book_data.pages.append(book_page)
# # Save to database
# db: Session = SessionLocal()
# crud.create_book(db=db, book=book_data)
import fitz # PyMuPDF
import os
from sqlalchemy.orm import Session
from app.database import SessionLocal, engine, Base
from app import crud, schemas
# Issue CREATE TABLE for the models registered on Base's metadata
Base.metadata.create_all(engine)

# Source PDF and the folder that receives the image files pulled out of it
pdf_path = "bookpdf/DO-YOU-WONDER-ABOUT-RAIN-SNOW-SLEET-AND-HAIL-Free-Childrens-Book-By-Monkey-Pen.pdf"
output_dir = "extracted_content"
os.makedirs(name=output_dir, exist_ok=True)
def extract_text_and_images_from_pdf(pdf_path, output_dir):
    """Extract the text and embedded images of every page of a PDF.

    Each embedded image is written to ``output_dir`` as
    ``page_<page>_img_<index>.<ext>`` (both numbers are 1-based).

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the extracted image files.
            It is created if it does not exist yet.

    Returns:
        A list with one dict per page, in document order, with the keys
        ``"page_number"`` (1-based int), ``"content"`` (the page text) and
        ``"image_urls"`` (list of file paths of the images written for
        that page; empty when the page has no images).

    Raises:
        OSError: If an image file cannot be written.
        Errors raised by PyMuPDF while opening or reading the document
        are propagated unchanged.
    """
    # Do not rely on the caller having created the directory beforehand.
    os.makedirs(output_dir, exist_ok=True)

    book_pages = []
    # Open the document as a context manager so it is closed even when
    # extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text()

            # Extract images
            image_files = []
            image_list = page.get_images(full=True)
            for img_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first item of each entry is the image xref
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_filename = f"page_{page_number}_img_{img_index}.{image_ext}"
                image_filepath = os.path.join(output_dir, image_filename)
                with open(image_filepath, "wb") as img_file:
                    img_file.write(image_bytes)
                image_files.append(image_filepath)

            book_pages.append({
                "page_number": page_number,
                "content": text,
                "image_urls": image_files,
            })
    return book_pages
# Extract content: one dict per PDF page (text plus paths of saved images)
book_pages = extract_text_and_images_from_pdf(pdf_path, output_dir)

# Define book metadata; the pages are attached below once they are built
book_data = schemas.BookCreate(
    title="DO YOU WONDER ABOUT RAIN, SNOW, SLEET, AND HAIL",
    author="T. Albert",
    publisher="Monkey Pen Ltd",
    description="A children's book explaining the water cycle and different forms of precipitation.",
    pages=[],
)

# Add extracted pages to book data
for page in book_pages:
    image_urls = page["image_urls"]
    book_page = schemas.BookPage(
        page_number=page["page_number"],
        content=page["content"],
        # Only the first image of a page is passed on as image_url; any
        # further images stay on disk but are not referenced here.
        image_url=image_urls[0] if image_urls else None,
    )
    book_data.pages.append(book_page)

# Save to database
db: Session = SessionLocal()
try:
    # NOTE(review): presumably create_book adds and commits the record;
    # confirm in app.crud, since no commit is issued here.
    crud.create_book(db=db, book=book_data)
finally:
    # Always release the session, even when the insert raises.
    db.close()