diff --git a/models/dt.joblib b/models/dt.joblib index 5aa9d22..04cc2a9 100644 Binary files a/models/dt.joblib and b/models/dt.joblib differ diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index 75bfd37..b91726a 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -74,7 +74,7 @@ def get_pdfium_chars(pdf_path, fontname_sample_freq=settings.FONTNAME_SAMPLE_FRE rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i) rotation = rotation * 180 / math.pi # convert from radians to degrees - coords = text_page.get_charbox(i, loose=False) + coords = text_page.get_charbox(i, loose=True) device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, page_rotation, normalize=True) char_info = { diff --git a/pyproject.toml b/pyproject.toml index 2654c0e..ac1f78b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.2.0" +version = "0.2.1" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0"